diff --git a/.coveragerc b/.coveragerc index 9a07159daeb8c..83805150083e5 100644 --- a/.coveragerc +++ b/.coveragerc @@ -24,4 +24,3 @@ omit = airflow/migrations/* airflow/www/node_modules/** airflow/www_rbac/node_modules/** - airflow/_vendor/* diff --git a/.dockerignore b/.dockerignore index e7d656456dc59..8a90d745cfbe3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -31,10 +31,21 @@ !common !dags !dev +!chart !docs !licenses -!scripts +!metastore_browser + +# Add those folders to the context so that they are available in the CI container +!scripts/in_container +!scripts/docker + +# Add backport packages to the context +!backport_packages + +# Add tests and kubernetes_tests to the context. !tests +!kubernetes_tests !.coveragerc !.rat-excludes @@ -42,14 +53,16 @@ !.dockerignore !pytest.ini !CHANGELOG.txt -!Dockerfile.ci -!Dockerfile !LICENSE !MANIFEST.in !NOTICE !.github -!requirements -!entrypoint.sh +!empty + +# This folder is for you if you want to add any packages to the docker context when you build your own +# docker image. Most other files and any new folders you add will be excluded by default; +# if you need other types of files, please add the extensions here. +!docker-context-files # Avoid triggering context change on README change (new companies using Airflow) # So please do not uncomment this line ;) diff --git a/.flake8 b/.flake8 index 4a2ca5b7e1a17..099ff70f8bc03 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,5 @@ [flake8] max-line-length = 110 ignore = E731,W504,I001,W503 -exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.eggs,*.egg,*/_vendor/*,node_modules +exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.eggs,*.egg,node_modules format = ${cyan}%(path)s${reset}:${yellow_bold}%(row)d${reset}:${green_bold}%(col)d${reset}: ${red_bold}%(code)s${reset} %(text)s diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000..d872017df781b --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +/chart export-ignore diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 64625e449bd9d..1e3c23d5ad347 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,10 +1,23 @@ -- [ ] Description above provides context of the change -- [ ] Commit message contains [\[AIRFLOW-XXXX\]](https://issues.apache.org/jira/browse/AIRFLOW-XXXX) or `[AIRFLOW-XXXX]` for document-only changes -- [ ] Unit tests coverage for changes (not needed for documentation changes) -- [ ] Commits follow "[How to write a good git commit message](http://chris.beams.io/posts/git-commit/)" -- [ ] Relevant documentation is updated including usage instructions. -- [ ] I will engage committers as explained in [Contribution Workflow Example](https://github.com/apache/airflow/blob/master/CONTRIBUTING.rst#contribution-workflow-example). + + +--- +**^ Add meaningful description above** + +Read the **[Pull Request Guidelines](https://github.com/apache/airflow/blob/master/CONTRIBUTING.rst#pull-request-guidelines)** for more information. In case of a fundamental code change, an Airflow Improvement Proposal ([AIP](https://cwiki.apache.org/confluence/display/AIRFLOW/Airflow+Improvements+Proposals)) is needed. In case of a new dependency, check compliance with the [ASF 3rd Party License Policy](https://www.apache.org/legal/resolved.html#category-x). In case of backwards incompatible changes, please leave a note in [UPDATING.md](https://github.com/apache/airflow/blob/master/UPDATING.md).
-Read the [Pull Request Guidelines](https://github.com/apache/airflow/blob/master/CONTRIBUTING.rst#pull-request-guidelines) for more information. diff --git a/.github/actions/cancel-workflow-runs b/.github/actions/cancel-workflow-runs new file mode 160000 index 0000000000000..953e057dc81d3 --- /dev/null +++ b/.github/actions/cancel-workflow-runs @@ -0,0 +1 @@ +Subproject commit 953e057dc81d3458935a18d1184c386b0f6b5738 diff --git a/.github/actions/checks-action b/.github/actions/checks-action new file mode 160000 index 0000000000000..9f02872da71b6 --- /dev/null +++ b/.github/actions/checks-action @@ -0,0 +1 @@ +Subproject commit 9f02872da71b6f558c6a6f190f925dde5e4d8798 diff --git a/.github/actions/codecov-action b/.github/actions/codecov-action new file mode 160000 index 0000000000000..1fc7722ded470 --- /dev/null +++ b/.github/actions/codecov-action @@ -0,0 +1 @@ +Subproject commit 1fc7722ded4708880a5aea49f2bfafb9336f0c8d diff --git a/.github/actions/configure-aws-credentials b/.github/actions/configure-aws-credentials new file mode 160000 index 0000000000000..e97d7fbc8e0e5 --- /dev/null +++ b/.github/actions/configure-aws-credentials @@ -0,0 +1 @@ +Subproject commit e97d7fbc8e0e5af69631c13daa0f4b5a8d88165b diff --git a/.github/actions/get-workflow-origin b/.github/actions/get-workflow-origin new file mode 160000 index 0000000000000..588cc14f9f1cd --- /dev/null +++ b/.github/actions/get-workflow-origin @@ -0,0 +1 @@ +Subproject commit 588cc14f9f1cdf1b8be3db816855e96422204fec diff --git a/.github/actions/github-push-action b/.github/actions/github-push-action new file mode 160000 index 0000000000000..40bf560936a80 --- /dev/null +++ b/.github/actions/github-push-action @@ -0,0 +1 @@ +Subproject commit 40bf560936a8022e68a3c00e7d2abefaf01305a6 diff --git a/.github/actions/label-when-approved-action b/.github/actions/label-when-approved-action new file mode 160000 index 0000000000000..4c5190fec5661 --- /dev/null +++ b/.github/actions/label-when-approved-action @@ -0,0 +1 @@ +Subproject commit 4c5190fec5661e98d83f50bbd4ef9ebb48bd1194 diff --git a/.github/workflows/build-images-workflow-run.yml b/.github/workflows/build-images-workflow-run.yml new file mode 100644 index 0000000000000..5c85cb44f0735 --- /dev/null +++ b/.github/workflows/build-images-workflow-run.yml @@ -0,0 +1,427 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +--- +name: "Build Images" +on: # yamllint disable-line rule:truthy + workflow_run: + workflows: ["CI Build"] + types: ['requested'] +env: + MOUNT_LOCAL_SOURCES: "false" + FORCE_ANSWER_TO_QUESTIONS: "yes" + FORCE_PULL_IMAGES: "true" + CHECK_IMAGE_FOR_REBUILD: "true" + SKIP_CHECK_REMOTE_IMAGE: "true" + DB_RESET: "true" + VERBOSE: "true" + USE_GITHUB_REGISTRY: "true" + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_USERNAME: ${{ github.actor }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REGISTRY_PULL_IMAGE_TAG: "latest" + GITHUB_REGISTRY_WAIT_FOR_IMAGE: "false" + BUILD_IMAGES: ${{ secrets.AIRFLOW_GITHUB_REGISTRY_WAIT_FOR_IMAGE != 'false' }} + +jobs: + + cancel-workflow-runs: + timeout-minutes: 10 + name: "Cancel workflow runs" + runs-on: ubuntu-20.04 + outputs: + sourceHeadRepo: ${{ steps.source-run-info.outputs.sourceHeadRepo }} + sourceHeadBranch: ${{ steps.source-run-info.outputs.sourceHeadBranch }} + sourceHeadSha: ${{ steps.source-run-info.outputs.sourceHeadSha }} + mergeCommitSha: ${{ steps.source-run-info.outputs.mergeCommitSha }} + targetCommitSha: ${{ steps.source-run-info.outputs.targetCommitSha }} + pullRequestNumber: ${{ steps.source-run-info.outputs.pullRequestNumber }} + pullRequestLabels: ${{ steps.source-run-info.outputs.pullRequestLabels }} + targetBranch: ${{ steps.source-run-info.outputs.targetBranch }} + sourceEvent: ${{ steps.source-run-info.outputs.sourceEvent }} + cacheDirective: ${{ steps.cache-directive.outputs.docker-cache }} + buildImages: ${{ steps.build-images.outputs.buildImages }} + steps: + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + persist-credentials: false + submodules: recursive + - name: "Get information about the original trigger of the run" + uses: ./.github/actions/get-workflow-origin + id: source-run-info + with: + token: ${{ secrets.GITHUB_TOKEN }} + sourceRunId: ${{ github.event.workflow_run.id }} + - name: "Cancel duplicated 'CI Build' runs" + uses: ./.github/actions/cancel-workflow-runs + with: + token: ${{ secrets.GITHUB_TOKEN }} + cancelMode: allDuplicates + sourceRunId: ${{ github.event.workflow_run.id }} + - name: "Output BUILD_IMAGES" + id: build-images + run: | + # Workaround - jobs cannot access env variable in "ifs" + # https://github.community/t/how-to-set-and-access-a-workflow-variable/17335/16 + echo "::set-output name=buildImages::${BUILD_IMAGES}" + - name: "Cancel duplicated 'Build Image' runs" + # We find duplicates of our own "Build Image" runs - due to a missing feature + # in GitHub Actions, we have to use Job names to match Event/Repo/Branch matching + # trick ¯\_(ツ)_/¯. We name the build-info job appropriately + # and then we try to find and cancel all the jobs with the same Event + Repo + Branch as the + # current Event/Repo/Branch combination. + uses: ./.github/actions/cancel-workflow-runs + with: + cancelMode: namedJobs + token: ${{ secrets.GITHUB_TOKEN }} + notifyPRCancel: true + jobNameRegexps: > + [".*Event: ${{ steps.source-run-info.outputs.sourceEvent }} + Repo: ${{ steps.source-run-info.outputs.sourceHeadRepo }} + Branch: ${{ steps.source-run-info.outputs.sourceHeadBranch }}.*"] + if: env.BUILD_IMAGES == 'true' + - name: "Cancel all 'CI Build' runs where some jobs failed" + # We find any of the "CI Build" workflow runs, where any of the important jobs + # failed. The important jobs are selected by the regexp array below. 
+ # We also produce a list of canceled "CI Build" runs as output, so that we + # can cancel all the matching "Build Images" workflow runs in the two following steps. + # Yeah. Adding to the complexity ¯\_(ツ)_/¯. + uses: ./.github/actions/cancel-workflow-runs + id: cancel-failed + with: + token: ${{ secrets.GITHUB_TOKEN }} + cancelMode: failedJobs + sourceRunId: ${{ github.event.workflow_run.id }} + notifyPRCancel: true + jobNameRegexps: > + ["^Static checks.*", "^Build docs$", "^Spell check docs$", "^Backport packages$", + "^Checks: Helm tests$", "^Test OpenAPI*"] + - name: "Extract canceled failed runs" + # We use this step to build a regexp that will be used to match the Source Run id in + # the build-info job below. If we cancelled some "CI Build" runs in the "cancel-failed" step + # above - we also want to cancel the corresponding "Build Images" runs. Again, we have + # to match the jobs using job names rather than a proper API because that feature + # is currently missing in GitHub Actions ¯\_(ツ)_/¯. + id: extract-cancelled-failed-runs + if: steps.cancel-failed.outputs.cancelledRuns != '[]' + run: | + REGEXP="Source Run id: " + SEPARATOR="" + for run_id in $(echo "${{ steps.cancel-failed.outputs.cancelledRuns }}" | jq '.[]') + do + REGEXP="${REGEXP}${SEPARATOR}(${run_id})" + SEPARATOR="|" + done + echo "::set-output name=matching-regexp::[\"${REGEXP}\"]" + - name: "Cancel triggered 'Build Images' runs for the cancelled failed runs" + # In case we do have some cancelled jobs in the "cancel-failed" step above, + # we take the extracted regexp array prepared in the previous step and we use + # it to cancel any jobs that have matching names containing Source Run Id: + # followed by one of the run ids. Yes, I know it's super complex ¯\_(ツ)_/¯. + if: env.BUILD_IMAGES == 'true' && steps.cancel-failed.outputs.cancelledRuns != '[]' + uses: ./.github/actions/cancel-workflow-runs + with: + cancelMode: namedJobs + token: ${{ secrets.GITHUB_TOKEN }} + notifyPRCancel: true + jobNameRegexps: ${{ steps.extract-cancelled-failed-runs.outputs.matching-regexp }} + - name: "Cancel duplicated 'CodeQL' runs" + uses: ./.github/actions/cancel-workflow-runs + id: cancel + with: + token: ${{ secrets.GITHUB_TOKEN }} + cancelMode: allDuplicates + workflowFileName: 'codeql-analysis.yml' + - name: "Set Docker Cache Directive" + id: cache-directive + run: | + if [[ ${{ steps.source-run-info.outputs.sourceEvent }} == 'schedule' ]]; then + echo "::set-output name=docker-cache::disabled" + else + echo "::set-output name=docker-cache::pulled" + fi + - name: "Cancel all duplicated 'Build Image' runs" + # We find duplicates of all "Build Image" runs - due to a missing feature + # in GitHub Actions, we have to use Job names for the Event/Repo/Branch matching + # trick ¯\_(ツ)_/¯. We name the build-info job appropriately and then we try to match + # all the jobs with the same Event + Repo + Branch combination and cancel all the duplicates for those. + # This might cancel our own run, so this is the last step in the job. + uses: ./.github/actions/cancel-workflow-runs + with: + cancelMode: allDuplicatedNamedJobs + token: ${{ secrets.GITHUB_TOKEN }} + notifyPRCancel: true + selfPreservation: false + jobNameRegexps: '["Event: \\S* Repo: \\S* Branch: \\S* "]' + + build-info: + # The name is so long because we are using it to cancel duplicated 'Build Images' runs + # by matching Event/Repo/Branch. This is a workaround for a missing feature of GitHub + # Actions to link the source workflow run and the triggered workflow_run one.
+ # We are also cancelling SourceRunId in case we determine that we should cancel the source + # run because of some failing jobs in the source run. Again ¯\_(ツ)_/¯. + name: > + Event: ${{ needs.cancel-workflow-runs.outputs.sourceEvent }} + Repo: ${{ needs.cancel-workflow-runs.outputs.sourceHeadRepo }} + Branch: ${{ needs.cancel-workflow-runs.outputs.sourceHeadBranch }} + Run id: ${{ github.run_id }} + Source Run id: ${{ github.event.workflow_run.id }} + Sha: ${{ github.sha }} + Source Sha: ${{ needs.cancel-workflow-runs.outputs.sourceHeadSha }} + Merge commit Sha: ${{ needs.cancel-workflow-runs.outputs.mergeCommitSha }} + Target commit Sha: ${{ needs.cancel-workflow-runs.outputs.targetCommitSha }} + runs-on: ubuntu-20.04 + needs: [cancel-workflow-runs] + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + outputs: + pythonVersions: ${{ steps.selective-checks.outputs.python-versions }} + upgradeToLatestConstraints: ${{ steps.selective-checks.outputs.upgrade-to-latest-constraints }} + allPythonVersions: ${{ steps.selective-checks.outputs.all-python-versions }} + defaultPythonVersion: ${{ steps.selective-checks.outputs.default-python-version }} + run-tests: ${{ steps.selective-checks.outputs.run-tests }} + run-kubernetes-tests: ${{ steps.selective-checks.outputs.run-kubernetes-tests }} + image-build: ${{ steps.selective-checks.outputs.image-build }} + if: > + needs.cancel-workflow-runs.outputs.buildImages == 'true' + steps: + # First fetch the sha of the merge commit in case it is a pull request so that we can + # run selective tests + - name: > + Fetch merge commit ${{ github.ref }} ( ${{ github.sha }}: + merge_commit ${{ needs.cancel-workflow-runs.outputs.mergeCommitSha }} ) + uses: actions/checkout@v2 + with: + ref: ${{ needs.cancel-workflow-runs.outputs.mergeCommitSha }} + fetch-depth: 2 + if: needs.cancel-workflow-runs.outputs.sourceEvent == 'pull_request' + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + persist-credentials: false + - name: > + Event: ${{ needs.cancel-workflow-runs.outputs.sourceEvent }} + Repo: ${{ needs.cancel-workflow-runs.outputs.sourceHeadRepo }} + Branch: ${{ needs.cancel-workflow-runs.outputs.sourceHeadBranch }} + Run id: ${{ github.run_id }} + Source Run id: ${{ github.event.workflow_run.id }} + Sha: ${{ github.sha }} + Source Sha: ${{ needs.cancel-workflow-runs.outputs.sourceHeadSha }} + Merge commit Sha: ${{ needs.cancel-workflow-runs.outputs.mergeCommitSha }} + Target commit Sha: ${{ needs.cancel-workflow-runs.outputs.targetCommitSha }} + run: printenv + - name: > + Fetch incoming commit ${{ needs.cancel-workflow-runs.outputs.targetCommitSha }} with its parent + uses: actions/checkout@v2 + with: + ref: ${{ needs.cancel-workflow-runs.outputs.targetCommitSha }} + fetch-depth: 2 + persist-credentials: false + if: needs.cancel-workflow-runs.outputs.sourceEvent == 'pull_request' + # checkout the master version again, to use the right script in master workflow + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + persist-credentials: false + - name: Selective checks + id: selective-checks + env: + EVENT_NAME: ${{ needs.cancel-workflow-runs.outputs.sourceEvent }} + TARGET_COMMIT_SHA: ${{ needs.cancel-workflow-runs.outputs.targetCommitSha }} + PR_LABELS: ${{ needs.cancel-workflow-runs.outputs.pullRequestLabels }} + run: | + if [[ ${EVENT_NAME} == "pull_request" ]]; then + # Run selective checks + ./scripts/ci/selective_ci_checks.sh "${TARGET_COMMIT_SHA}" + else + # Run all checks +
./scripts/ci/selective_ci_checks.sh + fi + + build-images: + timeout-minutes: 80 + name: "Build ${{matrix.image-type}} images ${{matrix.python-version}}" + runs-on: ubuntu-20.04 + needs: [build-info, cancel-workflow-runs] + strategy: + matrix: + # We need to attempt to build all possible versions here because workflow_run + # event is run from master for both master and v1-10-tests + python-version: ${{ fromJson(needs.build-info.outputs.allPythonVersions) }} + image-type: [CI, PROD] + fail-fast: true + if: > + needs.build-info.outputs.image-build == 'true' && + needs.cancel-workflow-runs.outputs.buildImages == 'true' + env: + BACKEND: postgres + PYTHON_MAJOR_MINOR_VERSION: ${{ matrix.python-version }} + GITHUB_REGISTRY_PUSH_IMAGE_TAG: ${{ github.event.workflow_run.id }} + UPGRADE_TO_LATEST_CONSTRAINTS: ${{ needs.build-info.outputs.upgradeToLatestConstraints }} + DOCKER_CACHE: ${{ needs.cancel-workflow-runs.outputs.cacheDirective }} + steps: + - name: > + Checkout [${{ needs.cancel-workflow-runs.outputs.sourceEvent }}] + Event: ${{ needs.cancel-workflow-runs.outputs.sourceEvent }} + Repo: ${{ needs.cancel-workflow-runs.outputs.sourceHeadRepo }} + Branch: ${{ needs.cancel-workflow-runs.outputs.sourceHeadBranch }} + Run id: ${{ github.run_id }} + Source Run id: ${{ github.event.workflow_run.id }} + Sha: ${{ github.sha }} + Source Sha: ${{ needs.cancel-workflow-runs.outputs.sourceHeadSha }} + Merge commit Sha: ${{ needs.cancel-workflow-runs.outputs.mergeCommitSha }} + Target commit Sha: ${{ needs.cancel-workflow-runs.outputs.targetCommitSha }} + uses: actions/checkout@v2 + with: + ref: ${{ needs.cancel-workflow-runs.outputs.targetCommitSha }} + persist-credentials: false + - name: "Retrieve DEFAULTS from the _initialization.sh" + # We cannot "source" the script here because that would be a security problem (we cannot run + # any code that comes from the sources coming from the PR. Therefore we extract the + # DEFAULT_BRANCH and DEFAULT_CONSTRAINTS_BRANCH via custom grep/awk/sed commands + # Also 2.7 and 3.5 versions are not allowed to proceed on master + id: defaults + run: | + DEFAULT_BRANCH=$(grep "export DEFAULT_BRANCH" scripts/ci/libraries/_initialization.sh | \ + awk 'BEGIN{FS="="} {print $3}' | sed s'/["}]//g') + echo "DEFAULT_BRANCH=${DEFAULT_BRANCH}" >> $GITHUB_ENV + DEFAULT_CONSTRAINTS_BRANCH=$(grep "export DEFAULT_CONSTRAINTS_BRANCH" \ + scripts/ci/libraries/_initialization.sh | \ + awk 'BEGIN{FS="="} {print $3}' | sed s'/["}]//g') + echo "DEFAULT_CONSTRAINTS_BRANCH=${DEFAULT_CONSTRAINTS_BRANCH}" >> $GITHUB_ENV + if [[ \ + ${DEFAULT_BRANCH} != "master" || \ + ( ${PYTHON_MAJOR_MINOR_VERSION} != "2.7" && ${PYTHON_MAJOR_MINOR_VERSION} != "3.5" ) \ + ]]; then + echo "::set-output name=proceed::true" + else + echo "::set-output name=proceed::false" + fi + - name: > + Checkout "${{ needs.cancel-workflow-runs.outputs.targetBranch }}" branch to 'main-airflow' folder + to use ci/scripts from there. 
+ uses: actions/checkout@v2 + with: + path: "main-airflow" + ref: "${{ needs.cancel-workflow-runs.outputs.targetBranch }}" + persist-credentials: false + submodules: recursive + if: steps.defaults.outputs.proceed == 'true' + - name: Initiate Github Checks for Building image + # Use the submodule from main, not the PR branch + uses: ./main-airflow/.github/actions/checks-action + id: build-image-check + with: + token: ${{ secrets.GITHUB_TOKEN }} + name: "Status of image build ${{ matrix.image-type }}: ${{ matrix.python-version }}" + status: "in_progress" + sha: ${{ needs.cancel-workflow-runs.outputs.sourceHeadSha }} + details_url: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + output: > + {"summary": + "Building the image: ${{ matrix.image-type }}: ${{ matrix.python-version }}. See the + [Image Build](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) + for details" } + if: steps.defaults.outputs.proceed == 'true' + - name: "Setup python" + uses: actions/setup-python@v2 + with: + python-version: 3.6 + if: steps.defaults.outputs.proceed == 'true' + - name: > + Override "scripts/ci" with the "${{ needs.cancel-workflow-runs.outputs.targetBranch }}" branch + so that the PR does not override it + # We should not override those scripts which become part of the image as they will not be + # changed in the image built - we should only override those that are executed to build + # the image. + run: | + rm -rf "scripts/ci" + mv "main-airflow/scripts/ci" "scripts" + if: steps.defaults.outputs.proceed == 'true' + - name: "Free space" + run: ./scripts/ci/tools/ci_free_space_on_ci.sh + if: steps.defaults.outputs.proceed == 'true' + - name: "Build CI images ${{ matrix.python-version }}:${{ github.event.workflow_run.id }}" + run: ./scripts/ci/images/ci_prepare_ci_image_on_ci.sh + # locally built CI image is needed to prepare packages for PROD image build + if: steps.defaults.outputs.proceed == 'true' + - name: "Push CI images ${{ matrix.python-version }}:${{ github.event.workflow_run.id }}" + run: ./scripts/ci/images/ci_push_ci_images.sh + if: matrix.image-type == 'CI' && steps.defaults.outputs.proceed == 'true' + - name: "Build PROD images ${{ matrix.python-version }}:${{ github.event.workflow_run.id }}" + run: ./scripts/ci/images/ci_prepare_prod_image_on_ci.sh + if: matrix.image-type == 'PROD' && steps.defaults.outputs.proceed == 'true' + - name: "Push PROD images ${{ matrix.python-version }}:${{ github.event.workflow_run.id }}" + run: ./scripts/ci/images/ci_push_production_images.sh + if: matrix.image-type == 'PROD' && steps.defaults.outputs.proceed == 'true' + - name: Update Github Checks for Building image with status + uses: ./main-airflow/.github/actions/checks-action + if: always() && steps.defaults.outputs.proceed == 'true' + with: + token: ${{ secrets.GITHUB_TOKEN }} + check_id: ${{ steps.build-image-check.outputs.check_id }} + status: "completed" + sha: ${{ needs.cancel-workflow-runs.outputs.sourceHeadSha }} + conclusion: ${{ job.status }} + details_url: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + output: > + {"summary": + "Building the image: ${{ matrix.image-type }}: ${{ matrix.python-version }}. See the + [Image Build](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) + for details" } + + cancel-on-build-cancel: + name: "Cancel 'CI Build' jobs on build image cancelling." 
+ runs-on: ubuntu-20.04 + if: cancelled() + needs: [build-images] + steps: + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + persist-credentials: false + submodules: recursive + - name: "Canceling the 'CI Build' source workflow in case of cancellation!" + uses: ./.github/actions/cancel-workflow-runs + with: + token: ${{ secrets.GITHUB_TOKEN }} + cancelMode: self + notifyPRCancel: true + notifyPRCancelMessage: "Building image for the PR has been cancelled" + sourceRunId: ${{ github.event.workflow_run.id }} + + cancel-on-build-failure: + name: "Cancel 'CI Build' jobs on build image failing." + runs-on: ubuntu-20.04 + if: failure() + needs: [build-images] + steps: + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + persist-credentials: false + submodules: recursive + - name: "Canceling the 'CI Build' source workflow in case of failure!" + uses: ./.github/actions/cancel-workflow-runs + with: + token: ${{ secrets.GITHUB_TOKEN }} + cancelMode: self + notifyPRCancel: true + notifyPRCancelMessage: | + Building images for the PR has failed. Follow the workflow link to check the reason. + sourceRunId: ${{ github.event.workflow_run.id }} diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 0000000000000..8bdd809b5d19e --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+# +--- +name: "CodeQL" + +on: # yamllint disable-line rule:truthy + push: + branches: [master] + schedule: + - cron: '0 2 * * *' + +jobs: + selective-checks: + name: Selective checks + runs-on: ubuntu-20.04 + outputs: + needs-python-scans: ${{ steps.selective-checks.outputs.needs-python-scans }} + needs-javascript-scans: ${{ steps.selective-checks.outputs.needs-javascript-scans }} + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + fetch-depth: 2 + persist-credentials: false + - name: Selective checks + id: selective-checks + env: + EVENT_NAME: ${{ github.event_name }} + TARGET_COMMIT_SHA: ${{ github.sha }} + run: | + if [[ ${EVENT_NAME} == "pull_request" ]]; then + # Run selective checks + ./scripts/ci/selective_ci_checks.sh "${TARGET_COMMIT_SHA}" + else + # Run all checks + ./scripts/ci/selective_ci_checks.sh + fi + + analyze: + name: Analyze + runs-on: ubuntu-20.04 + needs: [selective-checks] + strategy: + fail-fast: false + matrix: + # Override automatic language detection by changing the below list + # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] + language: ['python', 'javascript'] + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + # We must fetch at least the immediate parents so that if this is + # a pull request then we can checkout the head. + fetch-depth: 2 + persist-credentials: false + if: | + matrix.language == 'python' && needs.selective-checks.outputs.needs-python-scans == 'true' || + matrix.language == 'javascript' && needs.selective-checks.outputs.needs-javascript-scans == 'true' + + # If this run was triggered by a pull request event, then checkout + # the head of the pull request instead of the merge commit. + - run: git checkout HEAD^2 + if: | + github.event_name == 'pull_request' && + (matrix.language == 'python' && needs.selective-checks.outputs.needs-python-scans == 'true' || + matrix.language == 'javascript' && needs.selective-checks.outputs.needs-javascript-scans == 'true') + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v1 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + if: | + matrix.language == 'python' && needs.selective-checks.outputs.needs-python-scans == 'true' || + matrix.language == 'javascript' && needs.selective-checks.outputs.needs-javascript-scans == 'true' + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
+ # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v1 + if: | + matrix.language == 'python' && needs.selective-checks.outputs.needs-python-scans == 'true' || + matrix.language == 'javascript' && needs.selective-checks.outputs.needs-javascript-scans == 'true' + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v1 + if: | + matrix.language == 'python' && needs.selective-checks.outputs.needs-python-scans == 'true' || + matrix.language == 'javascript' && needs.selective-checks.outputs.needs-javascript-scans == 'true' diff --git a/scripts/ci/minikdc.properties b/.github/workflows/label_when_reviewed.yml similarity index 73% rename from scripts/ci/minikdc.properties rename to .github/workflows/label_when_reviewed.yml index c70ff8448bf6d..5095953def137 100644 --- a/scripts/ci/minikdc.properties +++ b/.github/workflows/label_when_reviewed.yml @@ -1,4 +1,3 @@ -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,13 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# +--- +name: Label when reviewed +on: pull_request_review # yamllint disable-line rule:truthy + +jobs: -org.name=TEST -org.domain=LOCAL -kdc.bind.address=localhost -kdc.port=8888 -instance=DefaultKrbServer -max.ticket.lifetime=86400000 -max.renewable.lifetime=604800000 -transport=TCP -debug=true + label-when-reviewed: + name: "Label PRs when reviewed" + runs-on: ubuntu-20.04 + steps: + - name: "Do nothing. Only trigger corresponding workflow_run event" + run: echo diff --git a/.github/workflows/label_when_reviewed_workflow_run.yml b/.github/workflows/label_when_reviewed_workflow_run.yml new file mode 100644 index 0000000000000..1ed50dd95f10e --- /dev/null +++ b/.github/workflows/label_when_reviewed_workflow_run.yml @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +--- +name: Label when reviewed workflow run +on: # yamllint disable-line rule:truthy + workflow_run: + workflows: ["Label when reviewed"] + types: ['requested'] +jobs: + + label-when-reviewed: + name: "Label PRs when reviewed workflow run" + runs-on: ubuntu-20.04 + outputs: + labelSet: ${{ steps.label-when-reviewed.outputs.labelSet }} + steps: + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + persist-credentials: false + submodules: recursive + - name: "Get information about the original trigger of the run" + uses: ./.github/actions/get-workflow-origin + id: source-run-info + with: + token: ${{ secrets.GITHUB_TOKEN }} + sourceRunId: ${{ github.event.workflow_run.id }} + - name: Initiate Selective Build check + uses: ./.github/actions/checks-action + id: selective-build-check + with: + token: ${{ secrets.GITHUB_TOKEN }} + name: "Selective build check" + status: "in_progress" + sha: ${{ steps.source-run-info.outputs.sourceHeadSha }} + details_url: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + output: > + {"summary": + "Checking selective status of the build in + [the run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) + "} + - name: > + Event: ${{ steps.source-run-info.outputs.sourceEvent }} + Repo: ${{ steps.source-run-info.outputs.sourceHeadRepo }} + Branch: ${{ steps.source-run-info.outputs.sourceHeadBranch }} + Run id: ${{ github.run_id }} + Source Run id: ${{ github.event.workflow_run.id }} + Sha: ${{ github.sha }} + Source Sha: ${{ steps.source-run-info.outputs.sourceHeadSha }} + Merge commit Sha: ${{ steps.source-run-info.outputs.mergeCommitSha }} + Target commit Sha: ${{ steps.source-run-info.outputs.targetCommitSha }} + run: printenv + - name: > + Fetch incoming commit ${{ steps.source-run-info.outputs.targetCommitSha }} with its parent + uses: actions/checkout@v2 + with: + ref: ${{ steps.source-run-info.outputs.targetCommitSha }} + fetch-depth: 2 + persist-credentials: false + # checkout the master version again, to use the right script in master workflow + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + persist-credentials: false + submodules: recursive + - name: Selective checks + id: selective-checks + env: + EVENT_NAME: ${{ steps.source-run-info.outputs.sourceEvent }} + TARGET_COMMIT_SHA: ${{ steps.source-run-info.outputs.targetCommitSha }} + PR_LABELS: ${{ steps.source-run-info.outputs.pullRequestLabels }} + run: | + if [[ ${EVENT_NAME} == "pull_request_review" ]]; then + # Run selective checks + ./scripts/ci/selective_ci_checks.sh "${TARGET_COMMIT_SHA}" + else + # Run all checks + ./scripts/ci/selective_ci_checks.sh + fi + - name: "Label when approved by committers for PRs that require full tests" + uses: ./.github/actions/label-when-approved-action + id: label-full-test-prs-when-approved-by-commiters + if: > + steps.selective-checks.outputs.run-tests == 'true' && + contains(steps.selective-checks.outputs.test-types, 'Core') + with: + token: ${{ secrets.GITHUB_TOKEN }} + label: 'full tests needed' + require_committers_approval: 'true' + pullRequestNumber: ${{ steps.source-run-info.outputs.pullRequestNumber }} + comment: > + The PR most likely needs to run full matrix of tests because it modifies parts of the core + of Airflow. However, committers might decide to merge it quickly and take the risk. 
+ If they don't merge it quickly - please rebase it to the latest master at your convenience, + or amend the last commit of the PR, and push it with --force-with-lease. + - name: "Initiate GitHub Check forcing rerun of SH ${{ github.event.pull_request.head.sha }}" + uses: ./.github/actions/checks-action + id: full-test-check + if: steps.label-full-test-prs-when-approved-by-commiters.outputs.labelSet == 'true' + with: + token: ${{ secrets.GITHUB_TOKEN }} + name: "Please rebase or amend, and force push the PR to run full tests" + status: "in_progress" + sha: ${{ steps.source-run-info.outputs.sourceHeadSha }} + details_url: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + output: > + {"summary": + "The PR likely needs to run all tests! This was determined via selective check in + [the run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) + "} + - name: "Label when approved by committers for PRs that do not require full tests" + uses: ./.github/actions/label-when-approved-action + id: label-simple-test-prs-when-approved-by-commiters + if: > + steps.selective-checks.outputs.run-tests == 'true' && + ! contains(steps.selective-checks.outputs.test-types, 'Core') + with: + token: ${{ secrets.GITHUB_TOKEN }} + label: 'okay to merge' + require_committers_approval: 'true' + pullRequestNumber: ${{ steps.source-run-info.outputs.pullRequestNumber }} + comment: > + The PR is likely OK to be merged with just subset of tests for default Python and Database + versions without running the full matrix of tests, because it does not modify the core of + Airflow. If the committers decide that the full tests matrix is needed, they will add the label + 'full tests needed'. Then you should rebase to the latest master or amend the last commit + of the PR, and push it with --force-with-lease. + - name: "Label when approved by committers for PRs that do not require tests at all" + uses: ./.github/actions/label-when-approved-action + id: label-no-test-prs-when-approved-by-commiters + if: steps.selective-checks.outputs.run-tests != 'true' + with: + token: ${{ secrets.GITHUB_TOKEN }} + label: 'okay to merge' + pullRequestNumber: ${{ steps.source-run-info.outputs.pullRequestNumber }} + require_committers_approval: 'true' + comment: > + The PR is likely ready to be merged. No tests are needed as no important environment files, + nor python files were modified by it. However, committers might decide that full test matrix is + needed and add the 'full tests needed' label. Then you should rebase it to the latest master + or amend the last commit of the PR, and push it with --force-with-lease. 
+ - name: Update Selective Build check + uses: ./.github/actions/checks-action + if: always() + with: + token: ${{ secrets.GITHUB_TOKEN }} + check_id: ${{ steps.selective-build-check.outputs.check_id }} + status: "completed" + sha: ${{ steps.source-run-info.outputs.sourceHeadSha }} + conclusion: ${{ job.status }} + details_url: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + output: > + {"summary": + "Checking selective status of the build completed in + [the run](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) + "} diff --git a/.github/workflows/scheduled_quarantined.yml b/.github/workflows/scheduled_quarantined.yml new file mode 100644 index 0000000000000..307169379c624 --- /dev/null +++ b/.github/workflows/scheduled_quarantined.yml @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +--- +name: Quarantined Build +on: # yamllint disable-line rule:truthy + schedule: + # Run quarantined builds 4 times a day to gather better quarantine stats + - cron: '12 */6 * * *' + +env: + MOUNT_LOCAL_SOURCES: "false" + FORCE_ANSWER_TO_QUESTIONS: "yes" + FORCE_PULL_IMAGES: "true" + CHECK_IMAGE_FOR_REBUILD: "true" + SKIP_CHECK_REMOTE_IMAGE: "true" + DB_RESET: "true" + VERBOSE: "true" + UPGRADE_TO_LATEST_CONSTRAINTS: false + PYTHON_MAJOR_MINOR_VERSION: 3.6 + USE_GITHUB_REGISTRY: "true" + # Since we run this build on schedule, it might be that the image has never been pushed + # Because the master merge was cancelled, so we have to rebuild the image for quarantined build + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_USERNAME: ${{ github.actor }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REGISTRY_PULL_IMAGE_TAG: "latest" + GITHUB_REGISTRY_PUSH_IMAGE_TAG: "latest" + GITHUB_REGISTRY_WAIT_FOR_IMAGE: "false" + +jobs: + + trigger-tests: + timeout-minutes: 5 + name: "Checks if tests should be run" + runs-on: ubuntu-20.04 + outputs: + run-tests: ${{ steps.trigger-tests.outputs.run-tests }} + steps: + - uses: actions/checkout@v2 + - name: "Check if tests should be run" + run: "./scripts/ci/tools/ci_check_if_tests_should_be_run.sh" + id: trigger-tests + + tests-quarantined: + timeout-minutes: 80 + name: "Quarantined tests" + runs-on: ubuntu-20.04 + continue-on-error: true + needs: [trigger-tests] + strategy: + matrix: + python-version: [3.6] + postgres-version: [9.6] + fail-fast: false + env: + BACKEND: postgres + PYTHON_MAJOR_MINOR_VERSION: ${{ matrix.python-version }} + POSTGRES_VERSION: ${{ matrix.postgres-version }} + RUN_TESTS: "true" + TEST_TYPE: Quarantined + NUM_RUNS: 20 + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + if: | + needs.trigger-tests.outputs.run-tests == 'true' || github.event_name != 'pull_request' + steps: + - uses: actions/checkout@v2 + with: + persist-credentials: 
false + - uses: actions/setup-python@v2 + with: + python-version: '3.7' + - name: "Set issue id for master" + if: github.ref == 'refs/heads/master' + run: | + echo "ISSUE_ID=10118" >> $GITHUB_ENV + - name: "Set issue id for v1-10-stable" + if: github.ref == 'refs/heads/v1-10-stable' + run: | + echo "ISSUE_ID=10127" >> $GITHUB_ENV + - name: "Set issue id for v1-10-test" + if: github.ref == 'refs/heads/v1-10-test' + run: | + echo "ISSUE_ID=10128" >> $GITHUB_ENV + - name: "Free space" + run: ./scripts/ci/tools/ci_free_space_on_ci.sh + - name: "Build CI image ${{ matrix.python-version }}" + run: ./scripts/ci/images/ci_prepare_ci_image_on_ci.sh + - name: "Tests" + run: ./scripts/ci/testing/ci_run_airflow_testing.sh + - uses: actions/upload-artifact@v2 + name: Upload Quarantine test results + if: always() + with: + name: 'quarantined_tests' + path: 'files/test_result.xml' + retention-days: 7 + - uses: actions/upload-artifact@v2 + name: Upload airflow logs + if: always() + with: + name: airflow-logs-quarantined-${{matrix.python-version}}-${{ matrix.postgres-version }} + path: './files/airflow_logs*' + retention-days: 7 diff --git a/.gitignore b/.gitignore index e05a7fdde7b22..efc568633f984 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,6 @@ secrets.py airflow.db unittests.db - # Airflow temporary artifacts airflow/git_version airflow/www/static/coverage/ @@ -154,8 +153,8 @@ rat-results.txt # Kubernetes generated templated files *.generated *.tar.gz -scripts/ci/in_container/kubernetes/kube/.generated/airflow.yaml -scripts/ci/in_container/kubernetes/docker/requirements.txt +scripts/ci/kubernetes/kube/.generated/airflow.yaml +scripts/ci/kubernetes/docker/requirements.txt # Node & Webpack Stuff *.entry.js @@ -191,3 +190,26 @@ log.txt* build-qcos-airflow/dags/ build-qcos-airflow/plugins/ +/backport_packages/CHANGELOG.txt + +# Docker context files +/docker-context-files +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Terraform variables +*.tfvars + +Chart.lock + +# Chart dependencies +**/charts/*.tgz + +# Might be generated when you build wheels +pip-wheel-metadata + +.pypirc diff --git a/.gitmodules b/.gitmodules index ef0149e36910b..e13d8ce224f28 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,24 @@ [submodule "plugins/default_data/data"] path = plugins/default_data/data url = https://github.com/chasers2012/qcos-base-data.git +[submodule ".github/actions/get-workflow-origin"] + path = .github/actions/get-workflow-origin + url = https://github.com/potiuk/get-workflow-origin +[submodule ".github/actions/cancel-workflow-runs"] + path = .github/actions/cancel-workflow-runs + url = https://github.com/potiuk/cancel-workflow-runs +[submodule ".github/actions/checks-action"] + path = .github/actions/checks-action + url = https://github.com/LouisBrunner/checks-action +[submodule ".github/actions/configure-aws-credentials"] + path = .github/actions/configure-aws-credentials + url = https://github.com/aws-actions/configure-aws-credentials +[submodule ".github/actions/codecov-action"] + path = .github/actions/codecov-action + url = https://github.com/codecov/codecov-action +[submodule ".github/actions/github-push-action"] + path = .github/actions/github-push-action + url = https://github.com/ad-m/github-push-action +[submodule ".github/actions/label-when-approved-action"] + path = .github/actions/label-when-approved-action + url = https://github.com/TobKed/label-when-approved-action diff --git a/.markdownlint.yml b/.markdownlint.yml new file mode 
100644 index 0000000000000..dae821798ede4 --- /dev/null +++ b/.markdownlint.yml @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +--- +# MD003/heading-style/header-style +MD003: false + +# MD004/ul-style +MD004: false + +# MD007/ul-indent +MD007: false + +# MD012/no-multiple-blanks +MD012: false + +# MD013 Line length +MD013: false + +# MD014/commands-show-output +MD014: false + +# MD022/blanks-around-headings/blanks-around-headers +MD022: false + +# MD024/no-duplicate-heading/no-duplicate-header +MD024: false + +# MD026/no-trailing-punctuation +MD026: false + +# MD029/ol-prefix +MD029: false + +# MD030/list-marker-space +MD030: false + +# MD031/blanks-around-fences +MD031: false + +# MD032/blanks-around-lists +MD032: false + +# MD033/no-inline-html +MD033: false + +# MD034/no-bare-urls +MD034: false + +# MD036/no-emphasis-as-heading/no-emphasis-as-header +MD036: false + +# MD040/fenced-code-language +MD040: false + +# MD041/first-line-heading/first-line-h1 +MD041: false + +# MD045/no-alt-text +MD045: false + +# MD046/code-block-style +MD046: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 400e73d10e0d2..a98b0ea2096aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,15 +21,19 @@ default_language_version: python: python3 minimum_pre_commit_version: "1.20.0" repos: + - repo: meta + hooks: + - id: identity + - id: check-hooks-apply - repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.1.7 + rev: v1.1.9 hooks: - id: forbid-tabs - exclude: ^airflow/_vendor/.*$|^docs/Makefile$ + exclude: ^docs/Makefile$|^clients/gen/go.sh|\.gitmodules$ - id: insert-license name: Add license for all SQL files files: \.sql$ - exclude: ^\.github/.*$|^airflow/_vendor/.*$ + exclude: ^\.github/.*$ args: - --comment-style - "/*||*/" @@ -38,7 +42,7 @@ repos: - --fuzzy-match-generates-todo - id: insert-license name: Add license for all other files - exclude: ^\.github/.*$"|^airflow/_vendor/.*$ + exclude: ^\.github/.*$ args: - --comment-style - "|#|" @@ -46,10 +50,10 @@ repos: - license-templates/LICENSE.txt - --fuzzy-match-generates-todo files: > - \.properties$|\.cfg$|\.conf$|\.ini$|\.ldif$|\.readthedocs$|\.service$|^Dockerfile.*$ + \.properties$|\.cfg$|\.conf$|\.ini$|\.ldif$|\.readthedocs$|\.service$|\.tf$|Dockerfile.*$ - id: insert-license name: Add license for all rst files - exclude: ^\.github/.*$"|^airflow/_vendor/.*$ + exclude: ^\.github/.*$ args: - --comment-style - "||" @@ -58,19 +62,19 @@ repos: - --fuzzy-match-generates-todo files: \.rst$ - id: insert-license - name: Add license for all JS/CSS files - files: \.(js|css)$ - exclude: ^\.github/.*$|^airflow/_vendor/.*$|^airflow/www/static/.*|^airflow/www_rbac/static/.*$ + name: Add license for all JS/CSS/PUML files + files: \.(js|css|puml)$ + exclude: 
^\.github/.*$|^airflow/www/static/.*|^airflow/www_rbac/static/.*$ args: - --comment-style - - "/**| *| */" + - "/*!| *| */" - --license-filepath - license-templates/LICENSE.txt - --fuzzy-match-generates-todo - id: insert-license name: Add license for all JINJA template files - files: ^airflow/www/templates/.*\.html$|^docs/templates/.*\.html$|^airflow/contrib/plugins/metastore_browser/templates/.*\.html$ # yamllint disable-line rule:line-length - exclude: ^\.github/.*$|^airflow/_vendor/.*$ + files: "^airflow/www/templates/.*\\.html$|^docs/templates/.*\\.html$.*\\.jinja2" + exclude: ^\.github/.*$ args: - --comment-style - "{#||#}" @@ -79,7 +83,7 @@ repos: - --fuzzy-match-generates-todo - id: insert-license name: Add license for all shell files - exclude: ^\.github/.*$"|^airflow/_vendor/.*$ + exclude: ^\.github/.*$ files: ^breeze$|^breeze-complete$|\.sh$|\.bash$|\.bats$ args: - --comment-style @@ -89,7 +93,7 @@ repos: - --fuzzy-match-generates-todo - id: insert-license name: Add license for all python files - exclude: ^\.github/.*$"|^airflow/_vendor/.*$ + exclude: ^\.github/.*$ types: [python] args: - --comment-style @@ -99,8 +103,8 @@ repos: - --fuzzy-match-generates-todo - id: insert-license name: Add license for all XML files - exclude: ^\.github/.*$"|^airflow/_vendor/.*$ - types: [xml] + exclude: ^\.github/.*$ + files: \.xml$ args: - --comment-style - "" @@ -109,8 +113,9 @@ repos: - --fuzzy-match-generates-todo - id: insert-license name: Add license for all yaml files - exclude: ^\.github/.*$"|^airflow/_vendor/.*$ + exclude: ^\.github/.*$ types: [yaml] + files: \.yml$|\.yaml$ args: - --comment-style - "|#|" @@ -120,27 +125,33 @@ repos: - id: insert-license name: Add license for all md files files: \.md$ - exclude: ^\.github/.*$|^airflow/_vendor/.*$ + exclude: ^\.github/.*$|PROVIDER_CHANGES.*\.md args: - --comment-style - "" - --license-filepath - license-templates/LICENSE.txt - --fuzzy-match-generates-todo + - id: insert-license + name: Add license for all mermaid files + args: + - --comment-style + - "|%%|" + - --license-filepath + - license-templates/LICENSE.txt + - --fuzzy-match-generates-todo + files: \.mermaid$ - repo: https://github.com/thlorenz/doctoc.git rev: v1.4.0 hooks: - id: doctoc name: Add TOC for md files - files: ^README\.md$|^CONTRIBUTING\.md$|^UPDATING.md$|^dev/README.md$ + files: ^README\.md$|^CONTRIBUTING\.md$|^UPDATING.*.md$|^dev/README\.md$|^dev/PROVIDER_PACKAGES.md$ args: - "--maxlevel" - "2" - - repo: meta - hooks: - - id: check-hooks-apply - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.5.0 + rev: v3.3.0 hooks: - id: check-merge-conflict - id: debug-statements @@ -151,160 +162,226 @@ repos: - id: check-xml - id: trailing-whitespace - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.4.4 + rev: v1.6.0 hooks: - id: rst-backticks - id: python-no-log-warn - - repo: local + - repo: https://github.com/adrienverge/yamllint + rev: v1.25.0 hooks: - id: yamllint name: Check yaml files with yamllint - entry: yamllint -c yamllint-config.yml - language: python - additional_dependencies: ['yamllint'] + entry: yamllint -c yamllint-config.yml --strict types: [yaml] - exclude: ^.*init_git_sync\.template\.yaml$|^.*airflow\.template\.yaml$ - - id: shellcheck - name: Check Shell scripts syntax correctness - language: docker_image - entry: koalaman/shellcheck:stable -x -a - files: ^breeze$|^breeze-complete$|\.sh$|^hooks/build$|^hooks/push$|\.bash$|\.bats$ - exclude: ^airflow/_vendor/.*$ - ## - ## Dear committer. 
- ## - ## If you ever come here to add the missing isort step here - hear a little warning. - ## - ## Initially isort will cause surprising duplicates of urlparse and other urllib related methods. - ## The urllib imports seem broken for python 2 but they are actually fine due to future - ## backport aliases installed elsewhere in the code (implicitly) - in 6 places. - ## - ## When you decide how to fix it (likely talking to other people in community) and you push - ## build to CI you will find terrible truth that in Airflow 1.10 modules are so much - ## cross-dependent, that imports in a number of places have to be done in specific order and - ## if this is not followed properly, circular imports kick-in and you are doomed. - ## - ## Running isort breaks the import House of Cards and there is no easy way to fix it short of - ## splitting a number of files and probably breaking compatibility. - ## - ## Luckily this has been fixed in Airflow 2.0 by proper untangling of the cross-dependencies and - ## 1.10.* branch is really in maintenance mode, so do not really waste your time here. - ## - ## Unless you really want of course. But then either delete this comment or increase the counter - ## below after you give up. - ## - ## Total hours wasted here = 3 - ## + exclude: + ^.*init_git_sync\.template\.yaml$|^.*airflow\.template\.yaml$|^chart/(?:templates|files)/.*\.yaml + ## + ## Dear committer. + ## + ## If you ever come here to add the missing isort step here - hear a little warning. + ## + ## Initially isort will cause surprising duplicates of urlparse and other urllib related methods. + ## The urllib imports seem broken for python 2 but they are actually fine due to future + ## backport aliases installed elsewhere in the code (implicitly) - in 6 places. + ## + ## When you decide how to fix it (likely talking to other people in community) and you push + ## build to CI you will find terrible truth that in Airflow 1.10 modules are so much + ## cross-dependent, that imports in a number of places have to be done in specific order and + ## if this is not followed properly, circular imports kick-in and you are doomed. + ## + ## Running isort breaks the import House of Cards and there is no easy way to fix it short of + ## splitting a number of files and probably breaking compatibility. + ## + ## Luckily this has been fixed in Airflow 2.0 by proper untangling of the cross-dependencies and + ## 1.10.* branch is really in maintenance mode, so do not really waste your time here. + ## + ## Unless you really want of course. But then either delete this comment or increase the counter + ## below after you give up. 
+ ## + ## Total hours wasted here = 3 + ## + - repo: local + hooks: - id: lint-dockerfile name: Lint dockerfile language: system - entry: "./scripts/ci/pre_commit_lint_dockerfile.sh" - files: ^Dockerfile.*$ + entry: "./scripts/ci/pre_commit/pre_commit_lint_dockerfile.sh" + files: Dockerfile.*$ pass_filenames: true - id: setup-order name: Checks for an order of dependencies in setup.py language: python files: ^setup.py$ pass_filenames: false - require_serial: true - entry: tests/test_order_setup.py + entry: ./scripts/ci/pre_commit/pre_commit_check_order_setup.py + - id: setup-installation + name: Checks if all the libraries in setup.py are listed in installation.rst file + language: python + files: ^setup.py$|^docs/installation.rst$ + pass_filenames: false + entry: ./scripts/ci/pre_commit/pre_commit_check_setup_installation.py + additional_dependencies: ['rich==9.2.0'] - id: update-breeze-file name: Update output of breeze command in BREEZE.rst - entry: "./scripts/ci/pre_commit_breeze_cmd_line.sh" + entry: "./scripts/ci/pre_commit/pre_commit_breeze_cmd_line.sh" language: system files: ^BREEZE.rst$|^breeze$|^breeze-complete$ pass_filenames: false - require_serial: true - id: update-local-yml-file name: Update mounts in the local yml file - entry: "./scripts/ci/pre_commit_local_yml_mounts.sh" + entry: "./scripts/ci/pre_commit/pre_commit_local_yml_mounts.sh" language: system - files: ^scripts/ci/_utils.sh$|s^scripts/ci/docker_compose/local.yml" + files: ^scripts/ci/libraries/_local_mounts.sh$|s^scripts/ci/docker_compose/local.yml" pass_filenames: false - require_serial: true - id: update-extras name: Update extras in documentation - entry: "./scripts/ci/pre_commit_update_extras.sh" - language: system + entry: ./scripts/ci/pre_commit/pre_commit_insert_extras.py + language: python files: ^setup.py$|^INSTALL$|^CONTRIBUTING.rst$ pass_filenames: false - require_serial: true - - id: python2-fastcheck + - id: pydevd language: pygrep + name: Check for pydevd debug statements accidentally left + entry: "pydevd.*settrace\\(" + pass_filenames: true + files: \.py$ + - id: dont-use-safe-filter + language: pygrep + name: Don't use safe in templates + description: the Safe filter is error-prone, use Markup() in code instead + entry: "\\|\\s*safe" + files: \.html$ + pass_filenames: true + - id: language-matters + language: pygrep + name: Check for language that we do not accept as community + description: Please use "deny_list" or "allow_list" instead. + entry: "(?i)(black|white)[_-]?list" + pass_filenames: true + exclude: > + (?x) + ^airflow/contrib/hooks/cassandra_hook.py$| + ^airflow/operators/hive_stats_operator.py$| + ^tests/contrib/hooks/test_cassandra_hook.py| + ^CHANGELOG.txt + - id: python2-fastcheck name: Find common Python 3 vs. 
2.7 compatibility problems + language: pygrep entry: > - (?x) (?!.*\#\ noqa)(?!.*//\ noqa)( # Exclude lines with '# noqa' or '// noqa' comment .super\(\).*| # Matches super() call from Python 3 ^\s*def\s*\S*\([^:#)]*:.*| # Matches function param with Python3 type ^\sdef\s*\S*\(.*\):\s*\-\>\s*\S*.* # Matches -> return value syntax from Python3 )$ - files: \.py$ - exclude: ^airflow/_vendor|^dev/ - pass_filenames: true - - id: check-providers-package - language: pygrep - name: Find providers package added in 2.0.* - entry: > - (?x) - ( - ^.*airflow\.providers.* # Matches import airflow.providers - )$ - files: \.py$ - exclude: ^airflow/_vendor + exclude: ^dev|^scripts|^docs|^chart pass_filenames: true - id: python2-compile name: Compile code using python2 language: system entry: python2.7 -m py_compile files: \.py$ - exclude: ^dev/ + exclude: ^dev|^scripts|^docs|^chart pass_filenames: true require_serial: true - - id: pydevd + - id: incorrect-use-of-LoggingMixin language: pygrep - name: Check for pydevd debug statements accidentally left - entry: "pydevd.*settrace\\(" - pass_filenames: true + name: Make sure LoggingMixin is not used alone + entry: "LoggingMixin\\(\\)" files: \.py$ - - id: build - name: Check if image build is needed - entry: ./scripts/ci/pre_commit_ci_build.sh 3.5 false + pass_filenames: true + - id: check-integrations + name: Check if integration list is aligned + entry: ./scripts/ci/pre_commit/pre_commit_check_integrations.sh language: system - always_run: true pass_filenames: false + files: ^common/_common_values.sh$|^breeze-complete$ - id: check-apache-license name: Check if licenses are OK for Apache - entry: "./scripts/ci/pre_commit_check_license.sh" + entry: "./scripts/ci/pre_commit/pre_commit_check_license.sh" language: system files: ^.*LICENSE.*$|^.*LICENCE.*$ pass_filenames: false - require_serial: true - id: airflow-config-yaml name: Checks for consistency between config.yml and default_config.cfg language: python - files: "^airflow/config_templates/config.yml$|^airflow/config_templates/default_airflow.cfg$" + entry: ./scripts/ci/pre_commit/pre_commit_yaml_to_cfg.py + files: "config.yml$|default_airflow.cfg$|default.cfg$" pass_filenames: false - require_serial: false - entry: scripts/ci/pre_commit_yaml_to_cfg.py + require_serial: true additional_dependencies: ['pyyaml'] + - id: pre-commit-descriptions + name: Check if pre-commits are described + entry: ./scripts/ci/pre_commit/pre_commit_check_pre_commits.sh + language: system + files: ^.pre-commit-config.yaml$|^STATIC_CODE_CHECKS.rst|^breeze-complete$ + require_serial: true + - id: helm-lint + name: Lint Helm Chart + entry: ./scripts/ci/pre_commit/pre_commit_helm_lint.sh + language: system + pass_filenames: false + files: ^chart + require_serial: true + - id: shellcheck + name: Check Shell scripts syntax correctness + language: docker_image + entry: koalaman/shellcheck:stable -x -a + files: ^breeze$|^breeze-complete$|\.sh$|^hooks/build$|^hooks/push$|\.bash$|\.bats$ + - id: bats-tests + name: Run BATS bash tests for changed bash files + language: system + entry: "./scripts/ci/pre_commit/pre_commit_bat_tests.sh" + files: ^breeze$|^breeze-complete$|\.sh$|\.bash$|\.bats$ + exclude: ^tests/bats/in_container/.*bats$|^scripts/in_container/.*sh$ + pass_filenames: false + - id: pre-commit-descriptions + name: Check if pre-commits are described + entry: ./scripts/ci/pre_commit/pre_commit_check_pre_commits.sh + language: system + files: ^.pre-commit-config.yaml$|^STATIC_CODE_CHECKS.rst|^breeze-complete$ + require_serial: true 
+ - id: sort-in-the-wild + name: Sort INTHEWILD.md alphabetically + entry: ./scripts/ci/pre_commit/pre_commit_sort_in_the_wild.sh + language: system + files: ^.pre-commit-config.yaml$|^INTHEWILD.md$ + require_serial: true + - id: markdownlint + name: Run markdownlint + description: "Checks the style of Markdown files." + entry: markdownlint + language: node + types: [markdown] + files: \.(md|mdown|markdown)$ + additional_dependencies: ['markdownlint-cli'] + - id: build + name: Check if image build is needed + entry: ./scripts/ci/pre_commit/pre_commit_ci_build.sh 3.6 false + language: system + always_run: true + pass_filenames: false - id: mypy name: Run mypy language: system - entry: "./scripts/ci/pre_commit_mypy.sh" + entry: "./scripts/ci/pre_commit/pre_commit_mypy.sh" files: \.py$ - exclude: ^airflow/_vendor/.*$|^dev + exclude: ^dev|^provider_packages|^chart + - id: mypy + name: Run mypy for helm chart tests + language: system + entry: "./scripts/ci/pre_commit/pre_commit_mypy.sh" + files: ^chart/.*\.py$ + exclude: ^dev require_serial: true - id: flake8 name: Run flake8 language: system - entry: "./scripts/ci/pre_commit_flake8.sh" + entry: "./scripts/ci/pre_commit/pre_commit_flake8.sh" files: \.py$ - exclude: ^dev/ pass_filenames: true - - id: bat-tests - name: Run BATS bash tests for changed bash files + - id: bats-in-container-tests + name: Run in container bats tests language: system - entry: "./scripts/ci/pre_commit_bat_tests.sh" - files: ^breeze$|^breeze-complete$|\.sh$|\.bash$ + entry: "./scripts/ci/pre_commit/pre_commit_in_container_bats_test.sh" + files: ^tests/bats/in_container/.*.bats$|^scripts/in_container/.*sh pass_filenames: false diff --git a/.rat-excludes b/.rat-excludes index 906d8044452df..69c8ccc960328 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -28,6 +28,7 @@ metastore_db .*sql .*svg .*csv +.*md5 CHANGELOG.txt .*zip .*lock @@ -74,11 +75,16 @@ apache-airflow-.*\+source.tar.gz.* apache-airflow-.*\+bin.tar.gz.* PULL_REQUEST_TEMPLATE.md -# vendored modules -_vendor/* # Locally mounted files .*egg-info/* .bash_history .bash_aliases .inputrc + +# the example notebook is ASF 2 licensed but RAT cannot read this +input_notebook.ipynb + +# .git might be a file in case of worktree +.git +tmp diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index dffb384a2eab1..0000000000000 --- a/.travis.yml +++ /dev/null @@ -1,233 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# ---- -dist: bionic -language: python -os: linux -env: - global: - - BUILD_ID=${TRAVIS_BUILD_ID} - - MOUNT_LOCAL_SOURCES="false" - - MOUNT_HOST_AIRFLOW_VOLUME="true" - - FORCE_ANSWER_TO_QUESTIONS="yes" - - SKIP_CHECK_REMOTE_IMAGE="true" - - DB_RESET="true" - - VERBOSE="true" - - CI="true" -python: "3.6" -stages: - - pre-test - - test -services: - - docker -jobs: - include: - - name: "Static checks" - stage: pre-test - script: ./scripts/ci/ci_run_all_static_tests.sh - env: >- - PYTHON_MAJOR_MINOR_VERSION=3.5 - AIRFLOW_MOUNT_SOURCE_DIR_FOR_STATIC_CHECKS="true" - - name: "Build documentation" - env: >- - PYTHON_MAJOR_MINOR_VERSION=3.5 - stage: test - script: ./scripts/ci/ci_docs.sh - - name: "Tests [Py3.6][Kubernetes][persistent]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=3.6 - RUNTIME=kubernetes - ENABLE_KIND_CLUSTER=true - KUBERNETES_MODE=persistent_mode - KUBERNETES_VERSION=v1.15.3 - stage: test - - name: "Tests [Py3.5][Kubernetes][git]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=3.5 - RUNTIME=kubernetes - ENABLE_KIND_CLUSTER=true - KUBERNETES_MODE=git_mode - KUBERNETES_VERSION=v1.15.3 - stage: test - - name: "Tests [Py2.7][Kubernetes][persistent]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=2.7 - ENABLE_KIND_CLUSTER=true - RUNTIME=kubernetes - KUBERNETES_MODE=persistent_mode - KUBERNETES_VERSION=v1.15.3 - stage: test - - name: "Tests [Py2.7][Kubernetes][git]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=2.7 - ENABLE_KIND_CLUSTER=true - RUNTIME=kubernetes - KUBERNETES_MODE=git_mode - KUBERNETES_VERSION=v1.15.3 - stage: test - - name: "Tests [Postgres9.6][Py3.6][integrations]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=3.6 - POSTGRES_VERSION=9.6 - ENABLED_INTEGRATIONS="cassandra kerberos mongo openldap rabbitmq redis" - RUN_INTEGRATION_TESTS=all - stage: test - - name: "Tests [Postgres9.6][Py3.6][kerberos]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=3.6 - POSTGRES_VERSION=9.6 - ENABLED_INTEGRATIONS="kerberos" - stage: test - - name: "Tests [Postgres10][Py3.6][integrations]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=3.6 - POSTGRES_VERSION=10 - ENABLED_INTEGRATIONS="cassandra kerberos mongo openldap rabbitmq redis" - RUN_INTEGRATION_TESTS=all - stage: test - - name: "Tests [Postgres9.6][Py3.7][kerberos]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=3.6 - POSTGRES_VERSION=9.6 - ENABLED_INTEGRATIONS="kerberos" - stage: test - - name: "Tests [Postgres10][Py3.7][integrations]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=3.6 - POSTGRES_VERSION=10 - ENABLED_INTEGRATIONS="cassandra kerberos mongo openldap rabbitmq redis" - RUN_INTEGRATION_TESTS=all - stage: test - - name: "Tests [Postgres10][Py3.6][kerberos]" - env: >- - BACKEND=postgres - PYTHON_MAJOR_MINOR_VERSION=3.6 - POSTGRES_VERSION=10 - ENABLED_INTEGRATIONS="kerberos" - stage: test - - name: "Tests [Sqlite][Py2.7][integrations]" - env: >- - BACKEND=sqlite - PYTHON_MAJOR_MINOR_VERSION=2.7 - ENABLED_INTEGRATIONS="cassandra kerberos mongo openldap rabbitmq redis" - RUN_INTEGRATION_TESTS=all - stage: test - - name: "Tests [Sqlite][Py3.5]" - env: >- - BACKEND=sqlite - PYTHON_MAJOR_MINOR_VERSION=3.5 - stage: test - - name: "Tests [MySQL5.6][Py3.6][integrations]" - env: >- - BACKEND=mysql - PYTHON_MAJOR_MINOR_VERSION=3.6 - MYSQL_VERSION=5.6 - ENABLED_INTEGRATIONS="cassandra kerberos mongo openldap rabbitmq redis" - RUN_INTEGRATION_TESTS=all - stage: test - - name: "Tests [MySQL5.6][Py2.7][kerberos]" - env: >- - 
BACKEND=mysql - PYTHON_MAJOR_MINOR_VERSION=2.7 - ENABLED_INTEGRATIONS="kerberos" - MYSQL_VERSION=5.6 - stage: test - - name: "Tests [MySQL5.7][Py3.6][integrations]" - env: >- - BACKEND=mysql - PYTHON_MAJOR_MINOR_VERSION=3.6 - MYSQL_VERSION=5.7 - ENABLED_INTEGRATIONS="cassandra kerberos mongo openldap rabbitmq redis" - RUN_INTEGRATION_TESTS=all - stage: test - - name: "Tests [MySQL5.7][Py2.7][kerberos]" - env: >- - BACKEND=mysql - PYTHON_MAJOR_MINOR_VERSION=2.7 - MYSQL_VERSION=5.7 - ENABLED_INTEGRATIONS="kerberos" - MYSQL_VERSION=5.7 - stage: test - - name: "Tests [MySQL5.7][Py3.7]" - env: >- - BACKEND=mysql - PYTHON_MAJOR_MINOR_VERSION=3.7 - MYSQL_VERSION=5.7 - stage: test - - name: "Generate requirements Py2.7" - env: >- - PYTHON_MAJOR_MINOR_VERSION=2.7 - stage: test - script: ./scripts/ci/ci_generate_requirements.sh - - name: "Generate requirements Py3.5" - env: >- - PYTHON_MAJOR_MINOR_VERSION=3.5 - stage: test - script: ./scripts/ci/ci_generate_requirements.sh - - name: "Generate requirements Py3.6" - env: >- - PYTHON_MAJOR_MINOR_VERSION=3.6 - SHOW_GENERATE_REQUIREMENTS_INSTRUCTIONS="true" - stage: test - script: ./scripts/ci/ci_generate_requirements.sh - - name: "Generate requirements Py3.7" - env: >- - PYTHON_MAJOR_MINOR_VERSION=3.7 - SHOW_GENERATE_REQUIREMENTS_INSTRUCTIONS="true" - stage: test - script: ./scripts/ci/ci_generate_requirements.sh - - name: "Build production image Py2.7" - env: >- - PYTHON_MAJOR_MINOR_VERSION="2.7" - stage: test - script: ./scripts/ci/ci_build_production_image.sh - before_install: - - echo - - name: "Build production image Py3.5" - env: >- - PYTHON_MAJOR_MINOR_VERSION="3.5" - stage: test - script: ./scripts/ci/ci_build_production_image.sh - before_install: - - echo - - name: "Build production image Py3.6" - env: >- - PYTHON_MAJOR_MINOR_VERSION="3.6" - stage: test - script: ./scripts/ci/ci_build_production_image.sh - before_install: - - echo - - name: "Build production image Py3.7" - env: >- - PYTHON_MAJOR_MINOR_VERSION="3.7" - stage: test - script: ./scripts/ci/ci_build_production_image.sh - before_install: - - echo -before_install: - - ./scripts/ci/ci_before_install.sh -script: ./scripts/ci/ci_run_airflow_testing.sh diff --git a/BREEZE.rst b/BREEZE.rst index 6b294ccaee719..a079b4effbc63 100644 --- a/BREEZE.rst +++ b/BREEZE.rst @@ -15,35 +15,38 @@ specific language governing permissions and limitations under the License. -.. image:: images/AirflowBreeze_logo.png - :align: center - :alt: Airflow Breeze Logo +.. raw:: html + +
+ Airflow Breeze - Development and Test Environment for Apache Airflow +
.. contents:: :local: -Airflow Breeze CI Environment +Airflow Breeze CI environment ============================= -Airflow Breeze is an easy-to-use development environment using +Airflow Breeze is an easy-to-use development and test environment using `Docker Compose `_. The environment is available for local use and is also used in Airflow's CI tests. -We called it *Airflow Breeze* as **It's a Breeze to develop Airflow**. +We called it *Airflow Breeze* as **It's a Breeze to contribute to Airflow**. The advantages and disadvantages of using the Breeze environment vs. other ways of testing Airflow are described in `CONTRIBUTING.rst `_. -Here is a short 10-minute video about Airflow Breeze (note that it shows an old version of Breeze. Some -of the points in the video are not valid any more. The video will be updated shortly with more up-to-date -version): +Watch the video below about Airflow Breeze. It explains the motivation for Breeze +and screencasts all its uses. + +.. raw:: html -.. image:: http://img.youtube.com/vi/ffKFHV6f3PQ/0.jpg - :width: 480px - :height: 360px - :scale: 100 % - :alt: Airflow Breeze Simplified Development Workflow - :align: center - :target: http://www.youtube.com/watch?v=ffKFHV6f3PQ +
+ + Airflow Breeze - Development and Test Environment for Apache Airflow + +
Prerequisites ============= @@ -66,9 +69,12 @@ Docker Community Edition Here is an example configuration with more than 200GB disk space for Docker: -.. image:: images/disk_space_osx.png - :align: left - :alt: Disk space OSX +.. raw:: html + +
+ Disk space MacOS +
Docker Compose -------------- @@ -76,44 +82,52 @@ Docker Compose - **Version**: Install the latest stable Docker Compose and add it to the PATH. See `Docker Compose Installation Guide `_ for details. -- **Permissions**: Configure to run the ``docker-compose`` command. +- **Permissions**: Configure permission to run the ``docker-compose`` command. -Docker Images Used by Breeze ----------------------------- +Docker in WSL 2 +--------------- -For all development tasks, unit tests, integration tests and static code checks, we use the -**CI image** maintained on the DockerHub in the ``apache/airflow`` repository. -This Docker image contains a lot test-related packages (size of ~1GB). -Its tag follows the pattern of ``-python-ci`` -(for example, ``apache/airflow:master-python3.6-ci`` or ``apache/airflow:v1-10-test-python3.6-ci``). -The image is built using the ``_ Dockerfile. +- **WSL 2 installation** : + Install WSL 2 and a Linux Distro (e.g. Ubuntu) see + `WSL 2 Installation Guide `_ for details. -For testing production image, the **Production image** is used and maintained on the DockerHub in the -```apache/airflow`` repository. This Docker image contains only size-optimised Airflow with selected -extras and dependencies. Its tag follows the pattern of ``-python`` -(for example, ``apache/airflow:master-python3.6`` or ``apache/airflow:v1-10-test-python3.6``). +- **Docker Desktop installation** : + Install Docker Desktop for Windows. For Windows Home follow the + `Docker Windows Home Installation Guide `_. + For Windows Pro, Enterprise, or Education follow the + `Docker Windows Installation Guide `_. -More information about the images can be found in ``_. +- **Docker setting** : + WSL integration needs to be enabled -By default CI images are used unless ``--production-image`` flag is used. +.. raw:: html -Before you run tests, enter the environment or run local static checks, the necessary local images should be -pulled and built from Docker Hub. This happens automatically for the test environment but you need to -manually trigger it for static checks as described in `Building the images <#building-the-images>`_ -and `Pulling the latest images <#pulling-the-latest-images>`_. -The static checks will fail and inform what to do if the image is not yet built. +
+ Airflow Breeze - Docker WSL2 integration +
-Building the image first time pulls a pre-built version of images from the Docker Hub, which may take some -time. But for subsequent source code changes, no wait time is expected. -However, changes to sensitive files like ``setup.py`` or ``Dockerfile.ci`` will trigger a rebuild -that may take more time though it is highly optimized to only rebuild what is needed. +- **WSL 2 Filesystem Performance** : + Accessing the host Windows filesystem incurs a performance penalty, + it is therefore recommended to do development on the Linux filesystem. + E.g. Run ``cd ~`` and create a development folder in your Linux distro home + and git pull the Airflow repo there. -In most cases, rebuilding an image requires network connectivity (for example, to download new -dependencies). If you work offline and do not want to rebuild the images when needed, you can set the -``FORCE_ANSWER_TO_QUESTIONS`` variable to ``no`` as described in the -`Default behaviour for user interaction <#default-behaviour-for-user-interaction>`_ section. +- **WSL 2 Memory Usage** : + WSL 2 can consume a lot of memory under the process name "Vmmem". To reclaim the memory after + development you can: -See `Troubleshooting section <#troubleshooting>`_ for steps you can make to clean the environment. + * On the Linux distro clear cached memory: ``sudo sysctl -w vm.drop_caches=3`` + * If no longer using Docker you can quit Docker Desktop + (right click system try icon and select "Quit Docker Desktop") + * If no longer using WSL you can shut it down on the Windows Host + with the following command: ``wsl --shutdown`` + +- **Developing in WSL 2** : + You can use all the standard Linux command line utilities to develop on WSL 2. + Further VS Code supports developing in Windows but remotely executing in WSL. + If VS Code is installed on the Windows host system then in the WSL Linux Distro + you can run ``code .`` in the root directory of you Airflow repo to launch VS Code. Getopt and gstat ---------------- @@ -141,7 +155,6 @@ If you use zsh, run this command and re-login: echo 'export PATH="/usr/local/opt/gnu-getopt/bin:$PATH"' >> ~/.zprofile . ~/.zprofile - Memory ------ @@ -151,45 +164,155 @@ On macOS, 2GB of RAM are available for your Docker containers by default, but mo (4GB should be comfortable). For details see `Docker for Mac - Advanced tab `_. -Airflow Directory Structure inside Docker ------------------------------------------ +On Windows WSL 2 expect the Linux Distro and Docker containers to use 7 - 8 GB of RAM. -When you are in the CI container, the following directories are used: +Cleaning the environment +------------------------ -.. code-block:: text +You may need to clean up your Docker environment occasionally. The images are quite big +(1.5GB for both images needed for static code analysis and CI tests) and, if you often rebuild/update +them, you may end up with some unused image data. - /opt/airflow - Contains sources of Airflow mounted from the host (AIRFLOW_SOURCES). - /root/airflow - Contains all the "dynamic" Airflow files (AIRFLOW_HOME), such as: - airflow.db - sqlite database in case sqlite is used; - dags - folder with non-test dags (test dags are in /opt/airflow/tests/dags); - logs - logs from Airflow executions; - unittest.cfg - unit test configuration generated when entering the environment; - webserver_config.py - webserver configuration generated when running Airflow in the container. 
+To clean up the Docker environment: -Note that when running in your local environment, the ``/root/airflow/logs`` folder is actually mounted -from your ``logs`` directory in the Airflow sources, so all logs created in the container are automatically -visible in the host as well. Every time you enter the container, the ``logs`` directory is -cleaned so that logs do not accumulate. +1. Stop Breeze with ``./breeze stop``. -When you are in the production container, the following directories are used: +2. Run the ``docker system prune`` command. -.. code-block:: text +3. Run ``docker images --all`` and ``docker ps --all`` to verify that your Docker is clean. - /opt/airflow - Contains sources of Airflow mounted from the host (AIRFLOW_SOURCES). - /root/airflow - Contains all the "dynamic" Airflow files (AIRFLOW_HOME), such as: - airflow.db - sqlite database in case sqlite is used; - dags - folder with non-test dags (test dags are in /opt/airflow/tests/dags); - logs - logs from Airflow executions; - unittest.cfg - unit test configuration generated when entering the environment; - webserver_config.py - webserver configuration generated when running Airflow in the container. + Both commands should return an empty list of images and containers respectively. -Note that when running in your local environment, the ``/root/airflow/logs`` folder is actually mounted -from your ``logs`` directory in the Airflow sources, so all logs created in the container are automatically -visible in the host as well. Every time you enter the container, the ``logs`` directory is -cleaned so that logs do not accumulate. +If you run into disk space errors, consider pruning your Docker images with the ``docker system prune --all`` +command. You may need to restart the Docker Engine before running this command. + +In case of disk space errors on macOS, increase the disk space available for Docker. See +`Prerequisites <#prerequisites>`_ for details. + + +Installation +============ + +Installation is as easy as checking out Airflow repository and running Breeze command. +You enter the Breeze test environment by running the ``./breeze`` script. You can run it with +the ``help`` command to see the list of available options. See `Breeze Command-Line Interface Reference`_ +for details. + +.. code-block:: bash + + ./breeze + +The First time you run Breeze, it pulls and builds a local version of Docker images. +It pulls the latest Airflow CI images from `Airflow DockerHub `_ +and uses them to build your local Docker images. Note that the first run (per python) might take up to 10 +minutes on a fast connection to start. Subsequent runs should be much faster. + +Once you enter the environment, you are dropped into bash shell of the Airflow container and you can +run tests immediately. + +To use the full potential of breeze you should set up autocomplete and you can +add the checked-out Airflow repository to your PATH to run Breeze without the ``./`` and from any directory. + +The ``breeze`` command comes with a built-in bash/zsh autocomplete setup command. After installing, when you +start typing the command, you can use to show all the available switches and get +auto-completion on typical values of parameters that you can use. + +You should set up the autocomplete option automatically by running: + +.. code-block:: bash + + ./breeze setup-autocomplete + +You get the auto-completion working when you re-enter the shell. 
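For illustration, a minimal sketch of adding the checked-out repository to ``PATH`` (the profile file and the clone location below are assumptions - adjust them to your setup):

.. code-block:: bash

    # Assumes the repository was cloned to ${HOME}/airflow and that you use bash
    echo 'export PATH="${PATH}:${HOME}/airflow"' >> ~/.bashrc
    source ~/.bashrc

    # breeze can now be invoked from any directory without the leading ./
    breeze help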
+ +When you enter the Breeze environment, automatically an environment file is sourced from +``files/airflow-breeze-config/variables.env``. The ``files`` folder from your local sources is +automatically mounted to the container under ``/files`` path and you can put there any files you want +to make available for the Breeze container. + +.. raw:: html + +
+ + Airflow Breeze - Installation + +
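As a sketch only (the variable name below is made up), you can pre-create the sourced file before entering Breeze:

.. code-block:: bash

    # The files/ folder from your local sources is mounted into the container under /files
    mkdir -p files/airflow-breeze-config

    # Anything exported here is sourced automatically when you enter Breeze
    echo 'export MY_CUSTOM_VARIABLE="some value"' >> files/airflow-breeze-config/variables.env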
+ +Running tests in the CI interactive environment +=============================================== + +Breeze helps with running tests in the same environment/way as CI tests are run. You can run various +types of tests while you enter Breeze CI interactive environment - this is described in detail +in ``_ + +.. raw:: html + +
+ + Airflow Breeze - Running tests + +
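For illustration only (the test path and selector below are hypothetical), a typical session looks roughly like this:

.. code-block:: bash

    # Enter the Breeze CI environment with a chosen backend and Python version
    ./breeze --backend postgres --python 3.6

    # Inside the container, run unit tests with pytest as usual
    pytest tests/core/ -k "test_example"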
+ +Choosing different Breeze environment configuration +=================================================== + +You can use additional ``breeze`` flags to choose your environment. You can specify a Python +version to use, and backend (the meta-data database). Thanks to that, with Breeze, you can recreate the same +environments as we have in matrix builds in the CI. + +For example, you can choose to run Python 3.6 tests with MySQL as backend and in the Docker environment as +follows: + +.. code-block:: bash + + ./breeze --python 3.6 --backend mysql + +The choices you make are persisted in the ``./.build/`` cache directory so that next time when you use the +``breeze`` script, it could use the values that were used previously. This way you do not have to specify +them when you run the script. You can delete the ``.build/`` directory in case you want to restore the +default settings. + +The defaults when you run the Breeze environment are Python 3.6 version and SQLite database. + +.. raw:: html + +
+ + Airflow Breeze - Selecting Python and Backend version + +
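If you want to go back to the defaults mentioned above, a simple sketch is:

.. code-block:: bash

    # Remove the cached selections so the next ./breeze run uses the defaults again
    rm -rf ./.build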
+ + +Troubleshooting +=============== + +If you are having problems with the Breeze environment, try the steps below. After each step you +can check whether your problem is fixed. + +1. If you are on macOS, check if you have enough disk space for Docker. +2. Restart Breeze with ``./breeze restart``. +3. Delete the ``.build`` directory and run ``./breeze build-image --force-pull-images``. +4. Clean up Docker images via ``breeze cleanup-image`` command. +5. Restart your Docker Engine and try again. +6. Restart your machine and try again. +7. Re-install Docker CE and try again. + +In case the problems are not solved, you can set the VERBOSE_COMMANDS variable to "true": + +.. code-block:: -Using the Airflow Breeze Environment -===================================== + export VERBOSE_COMMANDS="true" + + +Then run the failed command, copy-and-paste the output from your terminal to the +`Airflow Slack `_ #airflow-breeze channel and +describe your problem. + +Other uses of the Airflow Breeze environment +============================================ Airflow Breeze is a bash script serving as a "swiss-army-knife" of Airflow testing. Under the hood it uses other scripts that you can also run manually if you have problem with running the Breeze @@ -197,27 +320,52 @@ environment. Breeze script allows performing the following tasks: -Manage environments - CI (default) or Production - if ``--production-image`` flag is specified: +Managing CI environment: - * Build docker images with ``breeze build-image`` command - * Enter interactive shell when no command are specified (default behaviour) + * Build CI docker image with ``breeze build-image`` command + * Enter interactive shell in CI container when ``shell`` (or no command) is specified * Join running interactive shell with ``breeze exec`` command - * Start Kind Kubernetes cluster for Kubernetes tests if ``--start-kind-cluster`` flag is specified * Stop running interactive environment with ``breeze stop`` command * Restart running interactive environment with ``breeze restart`` command - * Optionally reset database if specified as extra ``--db-reset`` flag - * Optionally start integrations (separate images) if specified as extra ``--integration`` flags (only CI) + * Run test specified with ``breeze tests`` command + * Generate constraints with ``breeze generate-constraints`` command + * Execute arbitrary command in the test environment with ``breeze shell`` command + * Execute arbitrary docker-compose command with ``breeze docker-compose`` command + * Push docker images with ``breeze push-image`` command (require committer's rights to push images) + +You can optionally reset database if specified as extra ``--db-reset`` flag and for CI image you can also +start integrations (separate Docker images) if specified as extra ``--integration`` flags. You can also +chose which backend database should be used with ``--backend`` flag and python version with ``--python`` flag. -Interact with CI environment: +You can also have breeze launch Airflow automatically ``breeze start-airflow``, this will drop you in a +tmux session with three panes (one to monitor the scheduler, one for the webserver and one with a shell +for additional commands. 
- * Run test target specified with ``breeze test-target`` command - * Execute arbitrary command in the test environment with ``breeze execute-command`` command +Managing Prod environment (with ``--production-image`` flag): + + * Build CI docker image with ``breeze build-image`` command + * Enter interactive shell in PROD container when ``shell`` (or no command) is specified + * Join running interactive shell with ``breeze exec`` command + * Stop running interactive environment with ``breeze stop`` command + * Restart running interactive environment with ``breeze restart`` command + * Execute arbitrary command in the test environment with ``breeze shell`` command * Execute arbitrary docker-compose command with ``breeze docker-compose`` command + * Push docker images with ``breeze push-image`` command (require committer's rights to push images) + +You can optionally reset database if specified as extra ``--db-reset`` flag. You can also +chose which backend database should be used with ``--backend`` flag and python version with ``--python`` flag. + + +Manage and Interact with Kubernetes tests environment: + + * Manage KinD Kubernetes cluster and deploy Airflow to KinD cluster ``breeze kind-cluster`` commands + * Run Kubernetes tests specified with ``breeze kind-cluster tests`` command + * Enter the interactive kubernetes test environment with ``breeze kind-cluster shell`` command Run static checks: * Run static checks - either for currently staged change or for all files with - ``breeze static-check`` or ``breeze static-check-all-files`` command + ``breeze static-check`` command Build documentation: @@ -228,38 +376,15 @@ Set up local development environment: * Setup local virtualenv with ``breeze setup-virtualenv`` command * Setup autocomplete for itself with ``breeze setup-autocomplete`` command +Database volumes in Breeze +-------------------------- -Note that the below environment interaction is by default with the CI image. If you want to use production -image for those commands you need to add ``--production-image`` flag. - - -Entering Breeze CI environment ------------------------------- - -You enter the Breeze test environment by running the ``./breeze`` script. You can run it with -the ``help`` command to see the list of available options. See `Breeze Command-Line Interface Reference`_ -for details. - -.. code-block:: bash - - ./breeze - -First time you run Breeze, it pulls and builds a local version of Docker images. -It pulls the latest Airflow CI images from `Airflow DockerHub `_ -and use them to build your local Docker images. Note that the first run (per python) might take up to 10 -minutes on a fast connection to start. Subsequent runs should be much faster. - -Once you enter the environment, you are dropped into bash shell of the Airflow container and you can -run tests immediately. - -You can `set up autocomplete <#setting-up-autocomplete>`_ for commands and add the -checked-out Airflow repository to your PATH to run Breeze without the ``./`` and from any directory. - - -When you enter the Breeze environment, automatically an environment file is sourced from -``files/airflow-breeze-config/variables.env``. The ``files`` folder from your local sources is -automatically mounted to the container under ``/files`` path and you can put there any files you want -to make available fot the Breeze container. +Breeze keeps data for all it's integration in named docker volumes. Each backend and integration +keeps data in their own volume. 
Those volumes are persisted until ``./breeze stop`` command or +``./breeze restart`` command is run. You can also preserve the volumes by adding flag +``--preserve-volumes`` when you run either of those commands. Then, next time when you start +``Breeze``, it will have the data pre-populated. You can always delete the volumes by +running ``./breeze stop`` without the ``--preserve-volumes`` flag. Launching multiple terminals ---------------------------- @@ -268,63 +393,91 @@ Often if you want to run full airflow in the Breeze environment you need to laun run ``airflow webserver``, ``airflow scheduler``, ``airflow worker`` in separate terminals. This can be achieved either via ``tmux`` or via exec-ing into the running container from the host. Tmux -is installed inside the container and you can launch it with ``tmux`` command. Tmux provide you with the +is installed inside the container and you can launch it with ``tmux`` command. Tmux provides you with the capability of creating multiple virtual terminals and multiplex between them. More about ``tmux`` can be -found at `tmux github wiki page `_ . Tmux has several useful shortcuts +found at `tmux GitHub wiki page `_ . Tmux has several useful shortcuts that allow you to split the terminals, open new tabs etc - it's pretty useful to learn it. -Another - slightly easier - way is to exec into Breeze terminal from the host's terminal. Often you can +.. raw:: html + +
+ + Airflow Breeze - Using tmux + +
+ + +Another way is to exec into Breeze terminal from the host's terminal. Often you can have multiple terminals in the host (Linux/MacOS/WSL2 on Windows) and you can simply use those terminals -to enter running container. It's as easy as launching ``breeze exec`` while you already started the +to enter the running container. It's as easy as launching ``breeze exec`` while you already started the Breeze environment. You will be dropped into bash and environment variables will be read in the same way as when you enter the environment. You can do it multiple times and open as many terminals as you need. -Stopping Interactive environment --------------------------------- - -After starting up, the environment runs in the background and takes precious memory. -You can always stop it via: +.. raw:: html -.. code-block:: bash +
+ + Airflow Breeze - Using tmux + +
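For instance, one possible layout (a sketch, not the only way) to get the scheduler and webserver running side by side is:

.. code-block:: bash

    # Terminal 1 - start Breeze and run the scheduler inside the container
    ./breeze
    airflow scheduler

    # Terminal 2 - join the same running container from the host and start the webserver
    ./breeze exec
    airflow webserver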
- ./breeze stop -Restarting Breeze environment ------------------------------ +CLIs for cloud providers +------------------------ -You can also restart the environment and enter it via: +For development convenience we installed simple wrappers for the most common cloud providers CLIs. Those +CLIs are not installed when you build or pull the image - they will be downloaded as docker images +the first time you attempt to use them. It is downloaded and executed in your host's docker engine so once +it is downloaded, it will stay until you remove the downloaded images from your host container. -.. code-block:: bash +For each of those CLI credentials are taken (automatically) from the credentials you have defined in +your ${HOME} directory on host. - ./breeze restart +Those tools also have host Airflow source directory mounted in /opt/airflow path +so you can directly transfer files to/from your airflow host sources. -Choosing a Breeze Environment ------------------------------ +Those are currently installed CLIs (they are available as aliases to the docker commands): -You can use additional ``breeze`` flags to customize your environment. For example, you can specify a Python -version to use, backend and a container environment for testing. With Breeze, you can recreate the same -environments as we have in matrix builds in Travis CI. ++-----------------------+----------+-------------------------------------------------+-------------------+ +| Cloud Provider | CLI tool | Docker image | Configuration dir | ++=======================+==========+=================================================+===================+ +| Amazon Web Services | aws | amazon/aws-cli:latest | .aws | ++-----------------------+----------+-------------------------------------------------+-------------------+ +| Microsoft Azure | az | mcr.microsoft.com/azure-cli:latest | .azure | ++-----------------------+----------+-------------------------------------------------+-------------------+ +| Google Cloud | bq | gcr.io/google.com/cloudsdktool/cloud-sdk:latest | .config/gcloud | +| +----------+-------------------------------------------------+-------------------+ +| | gcloud | gcr.io/google.com/cloudsdktool/cloud-sdk:latest | .config/gcloud | +| +----------+-------------------------------------------------+-------------------+ +| | gsutil | gcr.io/google.com/cloudsdktool/cloud-sdk:latest | .config/gcloud | ++-----------------------+----------+-------------------------------------------------+-------------------+ -For example, you can choose to run Python 3.6 tests with MySQL as backend and in the Docker environment as -follows: +For each of the CLIs we have also an accompanying ``*-update`` alias (for example ``aws-update``) which +will pull the latest image for the tool. Note that all Google Cloud tools are served by one +image and they are updated together. -.. code-block:: bash +Also - in case you run several different Breeze containers in parallel (from different directories, +with different versions) - they docker images for CLI Cloud Providers tools are shared so if you update it +for one Breeze container, they will also get updated for all the other containers. - ./breeze --python 3.6 --backend mysql +.. raw:: html -The choices you make are persisted in the ``./.build/`` cache directory so that next time when you use the -``breeze`` script, it could use the values that were used previously. This way you do not have to specify -them when you run the script. 
You can delete the ``.build/`` directory in case you want to restore the -default settings. +
+ + Airflow Breeze - Cloud tools + +
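For example, assuming you already have AWS credentials in ``${HOME}/.aws`` on the host, the wrappers behave like the regular CLIs:

.. code-block:: bash

    # Runs the dockerized AWS CLI using the credentials from your host ~/.aws
    aws s3 ls

    # Pulls the latest amazon/aws-cli image used by the wrapper
    aws-update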
-The defaults when you run the Breeze environment are Python 3.6, Sqlite, and Docker. -Launching Breeze Integrations +Launching Breeze integrations ----------------------------- When Breeze starts, it can start additional integrations. Those are additional docker containers that are started in the same docker-compose command. Those are required by some of the tests -as described in `TESTING.rst `_. +as described in ``_. By default Breeze starts only airflow container without any integration enabled. If you selected ``postgres`` or ``mysql`` backend, the container for the selected backend is also started (but only the one @@ -338,199 +491,273 @@ Once integration is started, it will continue to run until the environment is st Note that running integrations uses significant resources - CPU and memory. -Cleaning the Environment ------------------------- +.. raw:: html -You may need to clean up your Docker environment occasionally. The images are quite big -(1.5GB for both images needed for static code analysis and CI tests) and, if you often rebuild/update -them, you may end up with some unused image data. +
+ + Airflow Breeze - Integrations + +
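As an example (any integration listed in the Breeze help can be substituted), you could start Breeze with two integrations enabled:

.. code-block:: bash

    # Start the Postgres backend plus the mongo and redis integration containers
    ./breeze --backend postgres --integration mongo --integration redis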
-To clean up the Docker environment: +Building CI images +------------------ -1. Stop Breeze with ``./breeze stop``. +With Breeze you can build images that are used by Airflow CI and production ones. -2. Run the ``docker system prune`` command. +For all development tasks, unit tests, integration tests, and static code checks, we use the +**CI image** maintained on the DockerHub in the ``apache/airflow`` repository. +This Docker image contains a lot of test-related packages (size of ~1GB). +Its tag follows the pattern of ``-python-ci`` +(for example, ``apache/airflow:master-python3.6-ci`` or ``apache/airflow:v1-10-test-python3.6-ci``). +The image is built using the ``_ Dockerfile. -3. Run ``docker images --all`` and ``docker ps --all`` to verify that your Docker is clean. +The CI image is built automatically as needed, however it can be rebuilt manually with +``build-image`` command. The production +image should be built manually - but also a variant of this image is built automatically when +kubernetes tests are executed see `Running Kubernetes tests <#running-kubernetes-tests>`_ - Both commands should return an empty list of images and containers respectively. +.. raw:: html -If you run into disk space errors, consider pruning your Docker images with the ``docker system prune --all`` -command. You may need to restart the Docker Engine before running this command. +
+ + Airflow Breeze - Building images + +
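A minimal sketch of rebuilding the CI image manually for a specific Python version (and of staying offline when you do not want any rebuild):

.. code-block:: bash

    # Rebuild the CI image for Python 3.7, pulling newer base images when needed
    ./breeze build-image --python 3.7

    # When working offline, keep using the existing local image instead of rebuilding
    export FORCE_ANSWER_TO_QUESTIONS="no"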
-In case of disk space errors on macOS, increase the disk space available for Docker. See -`Prerequisites <#prerequisites>`_ for details. +Building the image first time pulls a pre-built version of images from the Docker Hub, which may take some +time. But for subsequent source code changes, no wait time is expected. +However, changes to sensitive files like ``setup.py`` or ``Dockerfile.ci`` will trigger a rebuild +that may take more time though it is highly optimized to only rebuild what is needed. -Running Arbitrary Commands in the Breeze Environment -------------------------------------------------------- +Breeze has built in mechanism to check if your local image has not diverged too much from the +latest image build on CI. This might happen when for example latest patches have been released as new +Python images or when significant changes are made in the Dockerfile. In such cases, Breeze will +download the latest images before rebuilding because this is usually faster than rebuilding the image. -To run other commands/executables inside the Breeze Docker-based environment, use the -``./breeze execute-command`` command. To add arguments, specify them -together with the command surrounded with either ``"`` or ``'``, or pass them after ``--`` as extra arguments. +In most cases, rebuilding an image requires network connectivity (for example, to download new +dependencies). If you work offline and do not want to rebuild the images when needed, you can set the +``FORCE_ANSWER_TO_QUESTIONS`` variable to ``no`` as described in the +`Setting default behaviour for user interaction <#setting-default-behaviour-for-user-interaction>`_ section. -.. code-block:: bash +Preparing packages +------------------ - ./breeze execute-command "ls -la" +Breeze can also be used to prepare airflow packages - both "apache-airflow" main package and +provider packages. -.. code-block:: bash +You can read more about testing provider packages in +`TESTING.rst `_ - ./breeze execute-command ls -- --la +There are several commands that you can run in Breeze to manage and build packages: +* preparing Provider Readme files +* preparing Airflow packages +* preparing Provider packages -Running Docker Compose Commands -------------------------------- +Preparing provider readme files is part of the release procedure by the release managers +and it is described in detail in `dev `_ . -To run Docker Compose commands (such as ``help``, ``pull``, etc), use the -``docker-compose`` command. To add extra arguments, specify them -after ``--`` as extra arguments. +You can prepare provider packages - by default regular provider packages are prepared, but with +``--backport`` flag you can prepare backport packages. + +The packages are prepared in ``dist`` folder. Note, that this command cleans up the ``dist`` folder +before running, so you should run it before generating airflow package below as it will be removed. + +The below example builds provider packages in the wheel format. .. code-block:: bash - ./breeze docker-compose pull -- --ignore-pull-failures + ./breeze prepare-provider-packages +If you run this command without packages, you will prepare all packages, you can however specify +providers that you would like to build. By default only ``wheel`` packages are prepared, +but you can change it providing optional --package-format flag. -Mounting Local Sources to Breeze --------------------------------- -Important sources of Airflow are mounted inside the ``airflow`` container that you enter. 
-This means that you can continue editing your changes on the host in your favourite IDE and have them -visible in the Docker immediately and ready to test without rebuilding images. You can disable mounting -by specifying ``--skip-mounting-local-sources`` flag when running Breeze. In this case you will have sources -embedded in the container and changes to these sources will not be persistent. +.. code-block:: bash + ./breeze prepare-provider-packages --package-format=both google amazon -After you run Breeze for the first time, you will have an empty directory ``files`` in your source code, -which will be mapped to ``/files`` in your Docker container. You can pass there any files you need to -configure and run Docker. They will not be removed between Docker runs. +You can also prepare backport provider packages, if you specify ``--backport`` flag. You can read more +about backport packages in `dev `_ -By default ``/files/dags`` folder is mounted from your local ``/files/dags`` and this is -the directory used by airflow scheduler and webserver to scan dags for. You can use it to test your dags -from local sources in Airflow. If you wish to add local DAGs that can be run by Breeze. +.. code-block:: bash -Adding/Modifying Dependencies ------------------------------ + ./breeze prepare-provider-packages --backports --package-format=both google amazon -If you need to change apt dependencies in the ``Dockerfile.ci``, add Python packages in ``setup.py`` or -add javascript dependencies in ``package.json``, you can either add dependencies temporarily for a single -Breeze session or permanently in ``setup.py``, ``Dockerfile.ci``, or ``package.json`` files. +You can see all providers available by running this command: -Installing Dependencies for a Single Breeze Session -................................................... +.. code-block:: bash -You can install dependencies inside the container using ``sudo apt install``, ``pip install`` or -``yarn install`` (in ``airflow/www`` folder) respectively. This is useful if you want to test something -quickly while you are in the container. However, these changes are not retained: they disappear once you -exit the container (except for the node.js dependencies if your sources are mounted to the container). -Therefore, if you want to retain a new dependency, follow the second option described below. + ./breeze prepare-provider-packages -- --help -Adding Dependencies Permanently -............................... -You can add dependencies to the ``Dockerfile.ci``, ``setup.py`` or ``package.json`` and rebuild the image. -This should happen automatically if you modify any of these files. -After you exit the container and re-run ``breeze``, Breeze detects changes in dependencies, -asks you to confirm rebuilding the image and proceeds with rebuilding if you confirm (or skip it -if you do not confirm). After rebuilding is done, Breeze drops you to shell. You may also use the -``build-image`` command to only rebuild CI image and not to go into shell. +You can also prepare airflow packages using breeze: -Changing apt Dependencies in the Dockerfile.ci -.............................................. +.. code-block:: bash -During development, changing dependencies in ``apt-get`` closer to the top of the ``Dockerfile.ci`` -invalidates cache for most of the image. It takes long time for Breeze to rebuild the image. -So, it is a recommended practice to add new dependencies initially closer to the end -of the ``Dockerfile.ci``. 
This way dependencies will be added incrementally. + ./breeze prepare-airflow-packages -Before merge, these dependencies should be moved to the appropriate ``apt-get install`` command, -which is already in the ``Dockerfile.ci``. +This prepares airflow .whl package in the dist folder. -Port Forwarding ---------------- +Again, you can specify optional ``--package-format`` flag to build airflow packages. -When you run Airflow Breeze, the following ports are automatically forwarded: +.. code-block:: bash -* 28080 -> forwarded to Airflow webserver -> airflow:8080 -* 25433 -> forwarded to Postgres database -> postgres:5432 -* 23306 -> forwarded to MySQL database -> mysql:3306 + ./breeze prepare-airflow-packages --package-format=bot -You can connect to these ports/databases using: -* Webserver: ``http://127.0.0.1:28080`` -* Postgres: ``jdbc:postgresql://127.0.0.1:25433/airflow?user=postgres&password=airflow`` -* Mysql: ``jdbc:mysql://localhost:23306/airflow?user=root`` +Building Production images +-------------------------- -Start the webserver manually with the ``airflow webserver`` command if you want to connect -to the webserver. You can use ``tmux`` to multiply terminals. You may need to create a user prior to -running the webserver in order to log in. This can be done with the following command: +The **Production image** is also maintained on the DockerHub in the +```apache/airflow`` repository. This Docker image (and Dockerfile) contains size-optimised Airflow +installation with selected extras and dependencies. Its tag follows the pattern of +``-python`` (for example, ``apache/airflow:master-python3.6`` +or ``apache/airflow:v1-10-test-python3.6``). + +However in many cases you want to add your own custom version of the image - with added apt dependencies, +python dependencies, additional Airflow extras. Breeze's ``build-image`` command helps to build your own, +customized variant of the image that contains everything you need. + +You can switch to building the production image by adding ``--production-image`` flag to the ``build_image`` +command. Note, that the images can also be build using ``docker build`` command by passing appropriate +build-args as described in `IMAGES.rst `_ , but Breeze provides several flags that +makes it easier to do it. You can see all the flags by running ``./breeze build-image --help``, +but here typical examples are presented: .. code-block:: bash - airflow create_user --role Admin --username admin --password admin --email admin@example.com --firstname foo --lastname bar + ./breeze build-image --production-image --additional-extras "jira" -For databases, you need to run ``airflow resetdb`` at least once (or run some tests) after you started -Airflow Breeze to get the database/tables created. You can connect to databases with IDE or any other -database client: +This installs additional ``jira`` extra while installing airflow in the image. -.. image:: images/database_view.png - :align: center - :alt: Database view -You can change the used host port numbers by setting appropriate environment variables: +.. code-block:: bash -* ``WEBSERVER_HOST_PORT`` -* ``POSTGRES_HOST_PORT`` -* ``MYSQL_HOST_PORT`` + ./breeze build-image --production-image --additional-python-deps "torchio==0.17.10" + +This install additional pypi dependency - torchio in specified version. -If you set these variables, next time when you enter the environment the new ports should be in effect. -Setting Up Autocompletion -------------------------- +.. 
code-block:: bash + ./breeze build-image --production-image --additional-dev-apt-deps "libasound2-dev" \ + --additional-runtime-apt-deps "libasound2" +This installs additional apt dependencies - ``libasound2-dev`` in the build image and ``libasound2`` in the +final image. Those are development dependencies that might be needed to build and use python packages added +via the ``--additional-python-deps`` flag. The ``dev`` dependencies are not installed in the final +production image, they are only installed in the build "segment" of the production image that is used +as an intermediate step to build the final image. Usually names of the ``dev`` dependencies end with ``-dev`` +suffix and they need to also be paired with the corresponding runtime dependency added for the runtime image +(without -dev). .. code-block:: bash + ./breeze build-image --production-image --python 3.7 --additional-dev-apt-deps "libasound2-dev" \ + --additional-runtime-apt-deps "libasound2" +Same as above but uses python 3.7. +.. raw:: html +
+ + Airflow Breeze - Building Production images + +
-Sometimes during the build, you are asked whether to perform an action, skip it, or quit. This happens -when rebuilding or removing an image - actions that take a lot of time and could be potentially destructive. +Building Production images for 1.10 Airflow versions +---------------------------------------------------- -For automation scripts, you can export one of the three variables to control the default -interaction behaviour: +With Breeze you can also use the master Dockerfile to build custom images for released Airflow versions. +This works in the same way as building production image from master, but you need to add additional switch +``--install-airflow-version``. You should pass version of airflow (as released in PyPI). It can be used +to install both released versions and release candidates. Similarly as in case of master images, +we can pass additional extras/dependencies to install via the additional flags. -.. code-block:: +.. code-block:: bash - export FORCE_ANSWER_TO_QUESTIONS="yes" + ./breeze build-image --production-image --additional-extras "jira" --install-airflow-version="1.10.11" -If ``FORCE_ANSWER_TO_QUESTIONS`` is set to ``yes``, the images are automatically rebuilt when needed. -Images are deleted without asking. +Builds airflow image with released Airflow version 1.10.11 and additional extra "jira" added. -.. code-block:: +.. code-block:: bash - export FORCE_ANSWER_TO_QUESTIONS="no" + ./breeze build-image --production-image --install-airflow-version="1.10.11rc2" -If ``FORCE_ANSWER_TO_QUESTIONS`` is set to ``no``, the old images are used even if rebuilding is needed. -This is useful when you work offline. Deleting images is aborted. +Builds airflow image with released Airflow version 1.10.11rc2. -.. code-block:: - export FORCE_ANSWER_TO_QUESTIONS="quit" +You can also build airflow directly from GitHub source code - by providing Git Reference via +``--install-airflow-reference``. The reference can be a branch name, tag name, or commit hash. This +is useful mostly for testing. -If ``FORCE_ANSWER_TO_QUESTIONS`` is set to ``quit``, the whole script is aborted. Deleting images is aborted. +.. code-block:: bash -If more than one variable is set, ``yes`` takes precedence over ``no``, which takes precedence over ``quit``. + ./breeze build-image --production-image --install-airflow-reference="v1-10-test" + +This Builds airflow image from the current ``v1-10-test`` branch of Airflow. + +.. code-block:: bash + + ./breeze build-image --production-image \ + --install-airflow-reference="0d91fcf725f69e10f0969ca36f9e38e1d74110d0" + +This Builds airflow image from the ``0d91fcf725f69e10f0969ca36f9e38e1d74110d0`` commit hash on +GitHub. + +.. raw:: html + +
+ + Airflow Breeze - Building Production images for 1.10 Airflow versions + +
+ + +Running static checks --------------------- +You can run static checks via Breeze. You can also run them via the pre-commit command, but with auto-completion +Breeze makes it easier to run selective static checks. If you press the Tab key after the static-check and if +you have auto-complete set up, you should see an auto-completable list of all checks available. .. code-block:: bash + ./breeze static-check mypy +The above will run the mypy check for currently staged files. +You can also add an arbitrary pre-commit flag after ``--`` .. code-block:: bash + ./breeze static-check mypy -- --all-files +The above will run the mypy check for all files. +.. raw:: html +
+ + Airflow Breeze - Static checks + +
+ +If you ever need to get a list of the files that will be checked (for troubleshooting when playing with the +``--from-ref`` and ``--to-ref`` flags), run: + +.. code-block:: bash + + breeze static-check identity --verbose # currently staged files + breeze static-check identity --verbose -- --from-ref $(git merge-base master HEAD) --to-ref HEAD # branch updates Building the Documentation -------------------------- @@ -547,31 +774,69 @@ Often errors during documentation generation come from the docstrings of auto-ap During the docs building auto-api generated files are stored in the ``docs/_api`` folder. This helps you easily identify the location the problems with documentation originated from. -Using Your Host IDE =================== +.. raw:: html + +
+ + Airflow Breeze - Build docs + +
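+
+For reference, the documentation build itself is triggered with the ``build-docs`` command:
+
+.. code-block:: bash
+
+     ./breeze build-docs
+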
+
+Generating constraints
+----------------------
+
+Whenever setup.py gets modified, the CI master job will re-generate the constraint files. Those constraint
+files are stored in separate orphan branches: ``constraints-master`` and ``constraint-1-10``.
+They are stored separately for each python version. These are the
+constraint files described in detail in the
+``_ contributing documentation.
+
+In case someone modifies setup.py, the ``CRON`` scheduled CI build automatically upgrades and
+pushes changes to the constraint files; however, you can also perform a test run of this locally using
+the ``generate-constraints`` command of Breeze.
+
+.. code-block:: bash
+
+     ./breeze generate-constraints --python 3.6
+
+.. code-block:: bash
+
+     ./breeze generate-constraints --python 3.7
+
+.. code-block:: bash
+
+     ./breeze generate-constraints --python 3.8
+
+This bumps the constraint files to the latest versions and stores the hash of setup.py. The generated
+constraint and setup.py hash files are stored in the ``files`` folder, and while the constraints are
+generated, a diff of changes vs. the previous constraint files is printed.
+
+Using local virtualenv environment in Your Host IDE
+---------------------------------------------------
 
 You can set up your host IDE (for example, IntelliJ's PyCharm/Idea) to work with Breeze
 and benefit from all the features provided by your IDE, such as local and remote debugging,
-autocompletion, documentation support, etc.
+language auto-completion, documentation support, etc.
 
 To use your host IDE with Breeze:
 
-1. Create a local virtual environment as follows:
+1. Create a local virtual environment:
 
-   ``mkvirtualenv --python=python``
-
-   You can use any of the following wrappers to create and manage your virtual environemnts:
+   You can use any of the following wrappers to create and manage your virtual environments:
   `pyenv `_, `pyenv-virtualenv `_, or
   `virtualenvwrapper `_.
 
-   Ideally, you should have virtualenvs for all Python versions supported by Airflow (2.7, 3.5, 3.6)
-   and switch between them with the ``workon`` command.
+   Ideally, you should have virtualenvs for all Python versions supported by Airflow (2.7, 3.5, 3.6, 3.7, 3.8).
 
-2. Use the ``workon`` command to enter the Breeze environment.
+2. Use the right command to activate the virtualenv (``workon`` if you use virtualenvwrapper or
+   ``pyenv activate`` if you use pyenv).
 
 3. Initialize the created local virtualenv:
 
-   ``./breeze initialize-local-virtualenv``
+.. code-block:: bash
+
+   ./breeze initialize-local-virtualenv --python 3.8
 
 4. Select the virtualenv you created as the project's default virtualenv in your IDE.
 
@@ -580,261 +845,1259 @@ This is a lightweight solution that has its own limitations.
 
 More details on using the local virtualenv are available in the `LOCAL_VIRTUALENV.rst `_.
 
-Running static checks in Breeze
-===============================
+.. raw:: html
 
-The Breeze environment is also used to run some of the static checks as described in
-`STATIC_CODE_CHECKS.rst `_.
+
+ + Airflow Breeze - Initialize virtualenv + +
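+
+A minimal end-to-end sketch of the steps above, assuming you use pyenv with the pyenv-virtualenv plugin
+(the Python patch version and the virtualenv name below are just examples):
+
+.. code-block:: bash
+
+     pyenv virtualenv 3.6.10 airflow-breeze-3.6    # create the virtualenv
+     pyenv activate airflow-breeze-3.6             # activate it
+     ./breeze initialize-local-virtualenv --python 3.6
+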
+Running Kubernetes tests
+------------------------
 
-Running Tests in Breeze
-=======================
+Breeze helps with running Kubernetes tests in the same environment and in the same way as CI tests are run.
+Breeze helps you set up a KinD cluster for testing, sets up a virtualenv and automatically downloads
+the right tools to run the tests.
 
-As soon as you enter the Breeze environment, you can run Airflow unit tests via the ``pytest`` command.
+This is described in detail in `Testing Kubernetes `_.
 
-For supported CI test suites, types of unit tests, and other tests, see `TESTING.rst `_.
+.. raw:: html
 
-Breeze Command-Line Interface Reference
-=======================================
+
+ + Airflow Breeze - Kubernetes tests + +
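+
+One possible sequence of operations with the ``kind-cluster`` command (see the command reference below
+for all available operations and flags):
+
+.. code-block:: bash
+
+     ./breeze kind-cluster start     # start the KinD cluster
+     ./breeze kind-cluster deploy    # deploy Airflow to the cluster
+     ./breeze kind-cluster test      # run the Kubernetes tests
+     ./breeze kind-cluster stop      # stop the cluster when you are done
+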
-Airflow Breeze Syntax ---------------------- +Stopping the interactive environment +------------------------------------ + +After starting up, the environment runs in the background and takes precious memory. +You can always stop it via: + +.. code-block:: bash + + ./breeze stop + + +.. raw:: html + +
+ + Airflow Breeze - Stop environment + +
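+
+If you would like to keep the data volumes of the integration databases between sessions, the ``stop``
+command also accepts the ``--preserve-volumes`` flag (described in the command reference below):
+
+.. code-block:: bash
+
+     ./breeze stop --preserve-volumes
+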
+ + +Internal details of Breeze +========================== + +Airflow directory structure inside container +-------------------------------------------- + +When you are in the CI container, the following directories are used: + +.. code-block:: text + + /opt/airflow - Contains sources of Airflow mounted from the host (AIRFLOW_SOURCES). + /root/airflow - Contains all the "dynamic" Airflow files (AIRFLOW_HOME), such as: + airflow.db - sqlite database in case sqlite is used; + dags - folder with non-test dags (test dags are in /opt/airflow/tests/dags); + logs - logs from Airflow executions; + unittest.cfg - unit test configuration generated when entering the environment; + webserver_config.py - webserver configuration generated when running Airflow in the container. + +Note that when running in your local environment, the ``/root/airflow/logs`` folder is actually mounted +from your ``logs`` directory in the Airflow sources, so all logs created in the container are automatically +visible in the host as well. Every time you enter the container, the ``logs`` directory is +cleaned so that logs do not accumulate. + +When you are in the production container, the following directories are used: + +.. code-block:: text + + /opt/airflow - Contains sources of Airflow mounted from the host (AIRFLOW_SOURCES). + /root/airflow - Contains all the "dynamic" Airflow files (AIRFLOW_HOME), such as: + airflow.db - sqlite database in case sqlite is used; + dags - folder with non-test dags (test dags are in /opt/airflow/tests/dags); + logs - logs from Airflow executions; + unittest.cfg - unit test configuration generated when entering the environment; + webserver_config.py - webserver configuration generated when running Airflow in the container. + +Note that when running in your local environment, the ``/root/airflow/logs`` folder is actually mounted +from your ``logs`` directory in the Airflow sources, so all logs created in the container are automatically +visible in the host as well. Every time you enter the container, the ``logs`` directory is +cleaned so that logs do not accumulate. + +Running Arbitrary commands in the Breeze environment +---------------------------------------------------- + +To run other commands/executables inside the Breeze Docker-based environment, use the +``./breeze shell`` command. You should add your command as -c "command" after ``--`` as extra arguments. + +.. code-block:: bash + + ./breeze shell -- -c "ls -la" + +Running "Docker Compose" commands +--------------------------------- + +To run Docker Compose commands (such as ``help``, ``pull``, etc), use the +``docker-compose`` command. To add extra arguments, specify them +after ``--`` as extra arguments. + +.. code-block:: bash + + ./breeze docker-compose pull -- --ignore-pull-failures + +Restarting Breeze environment +----------------------------- + +You can also restart the environment and enter it via: + +.. code-block:: bash + + ./breeze restart + + +Setting default answers for user interaction +-------------------------------------------- + +Sometimes during the build, you are asked whether to perform an action, skip it, or quit. This happens +when rebuilding or removing an image - actions that take a lot of time and could be potentially destructive. + +For automation scripts, you can export one of the three variables to control the default +interaction behaviour: + +.. 
code-block:: + + export FORCE_ANSWER_TO_QUESTIONS="yes" + +If ``FORCE_ANSWER_TO_QUESTIONS`` is set to ``yes``, the images are automatically rebuilt when needed. +Images are deleted without asking. + +.. code-block:: + + export FORCE_ANSWER_TO_QUESTIONS="no" + +If ``FORCE_ANSWER_TO_QUESTIONS`` is set to ``no``, the old images are used even if rebuilding is needed. +This is useful when you work offline. Deleting images is aborted. + +.. code-block:: + + export FORCE_ANSWER_TO_QUESTIONS="quit" + +If ``FORCE_ANSWER_TO_QUESTIONS`` is set to ``quit``, the whole script is aborted. Deleting images is aborted. + +If more than one variable is set, ``yes`` takes precedence over ``no``, which takes precedence over ``quit``. + +Fixing File/Directory Ownership +------------------------------- + +On Linux, there is a problem with propagating ownership of created files (a known Docker problem). The +files and directories created in the container are not owned by the host user (but by the root user in our +case). This may prevent you from switching branches, for example, if files owned by the root user are +created within your sources. In case you are on a Linux host and have some files in your sources created +by the root user, you can fix the ownership of those files by running this script: + +.. code-block:: + + ./scripts/ci/tools/ci_fix_ownership.sh + +Mounting Local Sources to Breeze +-------------------------------- + +Important sources of Airflow are mounted inside the ``airflow`` container that you enter. +This means that you can continue editing your changes on the host in your favourite IDE and have them +visible in the Docker immediately and ready to test without rebuilding images. You can disable mounting +by specifying ``--skip-mounting-local-sources`` flag when running Breeze. In this case you will have sources +embedded in the container and changes to these sources will not be persistent. + + +After you run Breeze for the first time, you will have empty directory ``files`` in your source code, +which will be mapped to ``/files`` in your Docker container. You can pass there any files you need to +configure and run Docker. They will not be removed between Docker runs. + +By default ``/files/dags`` folder is mounted from your local ``/files/dags`` and this is +the directory used by airflow scheduler and webserver to scan dags for. You can use it to test your dags +from local sources in Airflow. If you wish to add local DAGs that can be run by Breeze. + +Port Forwarding +--------------- + +When you run Airflow Breeze, the following ports are automatically forwarded: + +* 28080 -> forwarded to Airflow webserver -> airflow:8080 +* 25555 -> forwarded to Flower dashboard -> airflow:5555 +* 25433 -> forwarded to Postgres database -> postgres:5432 +* 23306 -> forwarded to MySQL database -> mysql:3306 +* 26379 -> forwarded to Redis broker -> redis:6379 + +You can connect to these ports/databases using: + +* Webserver: ``http://127.0.0.1:28080`` +* Flower: ``http://127.0.0.1:25555`` +* Postgres: ``jdbc:postgresql://127.0.0.1:25433/airflow?user=postgres&password=airflow`` +* Mysql: ``jdbc:mysql://127.0.0.1:23306/airflow?user=root`` +* Redis: ``redis://127.0.0.1:26379/0``` + +Start the webserver manually with the ``airflow webserver`` command if you want to connect +to the webserver. You can use ``tmux`` to multiply terminals. You may need to create a user prior to +running the webserver in order to log in. This can be done with the following command: + +.. 
code-block:: bash
+
+    airflow users create --role Admin --username admin --password admin --email admin@example.com --firstname foo --lastname bar
+
+For databases, you need to run ``airflow db reset`` at least once (or run some tests) after you start
+Airflow Breeze to get the database/tables created. You can connect to these databases with your IDE or
+any other database client:
+
+
+.. raw:: html
+
+ Airflow Breeze - Database view +
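+
+For example, assuming you have the command-line clients installed on the host, you can connect to the
+forwarded databases like this (credentials taken from the URLs above):
+
+.. code-block:: bash
+
+     # Postgres - the password is "airflow"
+     psql -h 127.0.0.1 -p 25433 -U postgres airflow
+
+     # MySQL
+     mysql -h 127.0.0.1 -P 23306 -u root airflow
+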
+ +You can change the used host port numbers by setting appropriate environment variables: + +* ``WEBSERVER_HOST_PORT`` +* ``POSTGRES_HOST_PORT`` +* ``MYSQL_HOST_PORT`` + +If you set these variables, next time when you enter the environment the new ports should be in effect. + +Managing Dependencies +--------------------- + +If you need to change apt dependencies in the ``Dockerfile.ci``, add Python packages in ``setup.py`` or +add JavaScript dependencies in ``package.json``, you can either add dependencies temporarily for a single +Breeze session or permanently in ``setup.py``, ``Dockerfile.ci``, or ``package.json`` files. + +Installing Dependencies for a Single Breeze Session +................................................... + +You can install dependencies inside the container using ``sudo apt install``, ``pip install`` or +``yarn install`` (in ``airflow/www`` folder) respectively. This is useful if you want to test something +quickly while you are in the container. However, these changes are not retained: they disappear once you +exit the container (except for the node.js dependencies if your sources are mounted to the container). +Therefore, if you want to retain a new dependency, follow the second option described below. + +Adding Dependencies Permanently +............................... + +You can add dependencies to the ``Dockerfile.ci``, ``setup.py`` or ``package.json`` and rebuild the image. +This should happen automatically if you modify any of these files. +After you exit the container and re-run ``breeze``, Breeze detects changes in dependencies, +asks you to confirm rebuilding the image and proceeds with rebuilding if you confirm (or skip it +if you do not confirm). After rebuilding is done, Breeze drops you to shell. You may also use the +``build-image`` command to only rebuild CI image and not to go into shell. + +Incremental apt Dependencies in the Dockerfile.ci during development +.................................................................... + +During development, changing dependencies in ``apt-get`` closer to the top of the ``Dockerfile.ci`` +invalidates cache for most of the image. It takes long time for Breeze to rebuild the image. +So, it is a recommended practice to add new dependencies initially closer to the end +of the ``Dockerfile.ci``. This way dependencies will be added incrementally. + +Before merge, these dependencies should be moved to the appropriate ``apt-get install`` command, +which is already in the ``Dockerfile.ci``. + + +Breeze Command-Line Interface Reference +======================================= + +Airflow Breeze Syntax +--------------------- + +This is the current syntax for `./breeze <./breeze>`_: + + .. START BREEZE HELP MARKER + +.. code-block:: text + + + #################################################################################################### + + usage: breeze [FLAGS] [COMMAND] -- + + By default the script enters the CI container and drops you to bash shell, but you can choose + one of the commands to run specific actions instead. 
+ + Add --help after each command to see details: + + Commands without arguments: + + shell [Default] Enters interactive shell in the container + build-docs Builds documentation in the container + build-image Builds CI or Production docker image + cleanup-image Cleans up the container image created + exec Execs into running breeze container in new terminal + generate-constraints Generates pinned constraint files + push-image Pushes images to registry + initialize-local-virtualenv Initializes local virtualenv + prepare-airflow-packages Prepares airflow packages + setup-autocomplete Sets up autocomplete for breeze + start-airflow Starts Scheduler and Webserver and enters the shell + stop Stops the docker-compose environment + restart Stops the docker-compose environment including DB cleanup + toggle-suppress-cheatsheet Toggles on/off cheatsheet + toggle-suppress-asciiart Toggles on/off asciiart + + Commands with arguments: + + docker-compose Executes specified docker-compose command + kind-cluster Manages KinD cluster on the host + static-check Performs selected static check for changed files + tests Runs selected tests in the container + + Help commands: + + flags Shows all breeze's flags + help Shows this help message + help-all Shows detailed help for all commands and flags + + #################################################################################################### + + Detailed usage + + #################################################################################################### + + + Detailed usage for command: shell + + + breeze shell [FLAGS] [-- ] + + This is default subcommand if no subcommand is used. + + Enters interactive shell where you can run all tests, start Airflow webserver, scheduler, + workers, interact with the database, run DAGs etc. It is the default command if no command + is selected. The shell is executed in the container and in case integrations are chosen, + the integrations will be started as separated docker containers - under the docker-compose + supervision. Local sources are by default mounted to within the container so you can edit + them locally and run tests immediately in the container. Several folders ('files', 'dist') + are also mounted so that you can exchange files between the host and container. + + The 'files/airflow-breeze-config/variables.env' file can contain additional variables + and setup. This file is automatically sourced when you enter the container. Database + and webserver ports are forwarded to appropriate database/webserver so that you can + connect to it from your host environment. + + You can also pass after -- they will be passed as bash parameters, this is + especially useful to pass bash options, for example -c to execute command: + + 'breeze shell -- -c "ls -la"' + 'breeze -- -c "ls -la"' + + For DockerHub pull --dockerhub-user and --dockerhub-repo flags can be used to specify + the repository to pull from. For GitHub repository, the --github-repository + flag can be used for the same purpose. You can also use + --github-image-id | in case you want to pull the image + with specific COMMIT_SHA tag or RUN_ID. 
+ + 'breeze shell \ + --github-image-id 9a621eaa394c0a0a336f8e1b31b35eff4e4ee86e' - pull/use image with SHA + 'breeze \ + --github-image-id 9a621eaa394c0a0a336f8e1b31b35eff4e4ee86e' - pull/use image with SHA + 'breeze shell \ + --github-image-id 209845560' - pull/use image with RUN_ID + 'breeze \ + --github-image-id 209845560' - pull/use image with RUN_ID + + + #################################################################################################### + + + Detailed usage for command: build-docs + + + breeze build-docs + + Builds Airflow documentation. The documentation is build inside docker container - to + maintain the same build environment for everyone. Appropriate sources are mapped from + the host to the container so that latest sources are used. The folders where documentation + is generated ('docs/_build') are also mounted to the container - this way results of + the documentation build is available in the host. + + + #################################################################################################### + + + Detailed usage for command: build-image + + + breeze build-image [FLAGS] + + Builds docker image (CI or production) without entering the container. You can pass + additional options to this command, such as '--force-build-image', + '--force-pull-image', '--python', '--build-cache-local' or '-build-cache-pulled' + in order to modify build behaviour. + + You can also pass '--production-image' flag to build production image rather than CI image. + + For DockerHub pull --dockerhub-user and --dockerhub-repo flags can be used to specify + the repository to pull from. For GitHub repository, the --github-repository + flag can be used for the same purpose. You can also use + --github-image-id | in case you want to pull the image with + specific COMMIT_SHA tag or RUN_ID. + + Flags: + + -p, --python PYTHON_MAJOR_MINOR_VERSION + Python version used for the image. This is always major/minor version. + + One of: + + 2.7 3.5 3.6 3.7 3.8 + + -a, --install-airflow-version INSTALL_AIRFLOW_VERSION + If specified, installs Airflow directly from PIP released version. This happens at + image building time in production image and at container entering time for CI image. One of: + + 1.10.15 1.10.14 1.10.12 1.10.11 1.10.10 1.10.9 none wheel sdist + + When 'none' is used, you can install airflow from local packages. When building image, + airflow package should be added to 'docker-context-files' and + --install-from-docker-context-files flag should be used. When running an image, airflow + package should be added to dist folder and --install-packages-from-dist flag should be used. + + -t, --install-airflow-reference INSTALL_AIRFLOW_REFERENCE + If specified, installs Airflow directly from reference in GitHub. This happens at + image building time in production image and at container entering time for CI image. + This can be a GitHub branch like master or v1-10-test, or a tag like 2.0.0a1. + + --no-rbac-ui + Disables RBAC UI when Airflow 1.10.* is installed. + + --install-packages-from-dist + If specified it will look for packages placed in dist folder and it will install the + packages after installing Airflow. This is useful for testing provider + packages. + + -I, --production-image + Use production image for entering the environment and builds (not for tests). + + -F, --force-build-images + Forces building of the local docker images. 
The images are rebuilt + automatically for the first time or when changes are detected in + package-related files, but you can force it using this flag. + + -P, --force-pull-images + Forces pulling of images from DockerHub before building to populate cache. The + images are pulled by default only for the first time you run the + environment, later the locally build images are used as cache. + + Customization options: + + -E, --extras EXTRAS + Extras to pass to build images The default are different for CI and production images: + + CI image: + devel_ci + + Production image: + async,aws,azure,celery,dask,elasticsearch,gcp,kubernetes,mysql,postgres,redis,slack, + ssh,statsd,virtualenv + + --image-tag TAG + Additional tag in the image. + + --disable-pypi-when-building + Disable installing Airflow from pypi when building. If you use this flag and want + to install Airflow, you have to install it from packages placed in + 'docker-context-files' and use --install-from-local-files-when-building flag. + + --additional-extras ADDITIONAL_EXTRAS + Additional extras to pass to build images The default is no additional extras. + + --additional-python-deps ADDITIONAL_PYTHON_DEPS + Additional python dependencies to use when building the images. + + --dev-apt-command DEV_APT_COMMAND + The basic command executed before dev apt deps are installed. + + --additional-dev-apt-command ADDITIONAL_DEV_APT_COMMAND + Additional command executed before dev apt deps are installed. + + --additional-dev-apt-deps ADDITIONAL_DEV_APT_DEPS + Additional apt dev dependencies to use when building the images. + + --dev-apt-deps DEV_APT_DEPS + The basic apt dev dependencies to use when building the images. + + --additional-dev-apt-deps ADDITIONAL_DEV_DEPS + Additional apt dev dependencies to use when building the images. + + --additional-dev-apt-envs ADDITIONAL_DEV_APT_ENVS + Additional environment variables set when adding dev dependencies. + + --runtime-apt-command RUNTIME_APT_COMMAND + The basic command executed before runtime apt deps are installed. + + --additional-runtime-apt-command ADDITIONAL_RUNTIME_APT_COMMAND + Additional command executed before runtime apt deps are installed. + + --runtime-apt-deps ADDITIONAL_RUNTIME_APT_DEPS + The basic apt runtime dependencies to use when building the images. + + --additional-runtime-apt-deps ADDITIONAL_RUNTIME_DEPS + Additional apt runtime dependencies to use when building the images. + + --additional-runtime-apt-envs ADDITIONAL_RUNTIME_APT_DEPS + Additional environment variables set when adding runtime dependencies. + + Build options: + + --disable-mysql-client-installation + Disables installation of the mysql client which might be problematic if you are building + image in controlled environment. Only valid for production image. + + --constraints-location + Url to the constraints file. In case of the production image it can also be a path to the + constraint file placed in 'docker-context-files' folder, in which case it has to be + in the form of '/docker-context-files/' + + --disable-pip-cache + Disables GitHub PIP cache during the build. Useful if GitHub is not reachable during build. + + --install-from-local-files-when-building + This flag is used during image building. If it is used additionally to installing + Airflow from PyPI, the packages are installed from the .whl and .tar.gz packages placed + in the 'docker-context-files' folder. 
The same flag can be used during entering the image in + the CI image - in this case also the .whl and .tar.gz files will be installed automatically + + -C, --force-clean-images + Force build images with cache disabled. This will remove the pulled or build images + and start building images from scratch. This might take a long time. + + -r, --skip-rebuild-check + Skips checking image for rebuilds. It will use whatever image is available locally/pulled. + + -L, --build-cache-local + Uses local cache to build images. No pulled images will be used, but results of local + builds in the Docker cache are used instead. This will take longer than when the pulled + cache is used for the first time, but subsequent '--build-cache-local' builds will be + faster as they will use mostly the locally build cache. + + This is default strategy used by the Production image builds. + + -U, --build-cache-pulled + Uses images pulled from registry (either DockerHub or GitHub depending on + --github-registry flag) to build images. The pulled images will be used as cache. + Those builds are usually faster than when ''--build-cache-local'' with the exception if + the registry images are not yet updated. The DockerHub images are updated nightly and the + GitHub images are updated after merges to master so it might be that the images are still + outdated vs. the latest version of the Dockerfiles you are using. In this case, the + ''--build-cache-local'' might be faster, especially if you iterate and change the + Dockerfiles yourself. + + This is default strategy used by the CI image builds. + + -X, --build-cache-disabled + Disables cache during docker builds. This is useful if you want to make sure you want to + rebuild everything from scratch. + + This strategy is used by default for both Production and CI images for the scheduled + (nightly) builds in CI. + + -D, --dockerhub-user DOCKERHUB_USER + DockerHub user used to pull, push and build images. Default: apache. + + -H, --dockerhub-repo DOCKERHUB_REPO + DockerHub repository used to pull, push, build images. Default: airflow. + + -c, --github-registry GITHUB_REGISTRY + If GitHub registry is enabled, pulls and pushes are done from the GitHub registry not + DockerHub. You need to be logged in to the registry in order to be able to pull/push from + and you need to be committer to push to Apache Airflow' GitHub registry. + + -g, --github-repository GITHUB_REPOSITORY + GitHub repository used to pull, push images when cache is used. + Default: apache/airflow. + + If you use this flag, automatically --github-registry flag is enabled. + + -s, --github-image-id COMMIT_SHA|RUN_ID + or of the image. Images in GitHub registry are stored with those + to be able to easily find the image for particular CI runs. Once you know the + or , you can specify it in github-image-id flag and Breeze will + automatically pull and use that image so that you can easily reproduce a problem + that occurred in CI. + + If you use this flag, automatically --github-registry is enabled. + + + Default: latest. + + -v, --verbose + Show verbose information about executed docker, kind, kubectl, helm commands. Useful for + debugging - when you run breeze with --verbose flags you will be able to see the commands + executed under the hood and copy&paste them to your terminal to debug them more easily. + + Note that you can further increase verbosity and see all the commands executed by breeze + by running 'export VERBOSE_COMMANDS="true"' before running breeze. 
+ + + #################################################################################################### + + + Detailed usage for command: cleanup-image + + + breeze cleanup-image [FLAGS] + + Removes the breeze-related images created in your local docker image cache. This will + not reclaim space in docker cache. You need to 'docker system prune' (optionally + with --all) to reclaim that space. + + Flags: + + -p, --python PYTHON_MAJOR_MINOR_VERSION + Python version used for the image. This is always major/minor version. + + One of: + + 2.7 3.5 3.6 3.7 3.8 + + -I, --production-image + Use production image for entering the environment and builds (not for tests). + + -v, --verbose + Show verbose information about executed docker, kind, kubectl, helm commands. Useful for + debugging - when you run breeze with --verbose flags you will be able to see the commands + executed under the hood and copy&paste them to your terminal to debug them more easily. + + Note that you can further increase verbosity and see all the commands executed by breeze + by running 'export VERBOSE_COMMANDS="true"' before running breeze. + + + #################################################################################################### + + + Detailed usage for command: exec + + + breeze exec [-- ] + + Execs into interactive shell to an already running container. The container mus be started + already by breeze shell command. If you are not familiar with tmux, this is the best + way to run multiple processes in the same container at the same time for example scheduler, + webserver, workers, database console and interactive terminal. + + + #################################################################################################### + + + Detailed usage for command: generate-constraints + + + breeze generate-constraints [FLAGS] + + Generates pinned constraint files from setup.py. Those files are generated in files folder + - separate files for different python version. Those constraint files when pushed to orphan + constraint-master and constraint-1-10 branches are used to generate repeatable + CI builds as well as run repeatable production image builds. You can use those constraints + to predictably install released Airflow versions. This is mainly used to test the constraint + generation - constraints are pushed to the orphan branches by a successful scheduled + CRON job in CI automatically. + + Flags: + + -p, --python PYTHON_MAJOR_MINOR_VERSION + Python version used for the image. This is always major/minor version. + + One of: + + 2.7 3.5 3.6 3.7 3.8 + + -v, --verbose + Show verbose information about executed docker, kind, kubectl, helm commands. Useful for + debugging - when you run breeze with --verbose flags you will be able to see the commands + executed under the hood and copy&paste them to your terminal to debug them more easily. + + Note that you can further increase verbosity and see all the commands executed by breeze + by running 'export VERBOSE_COMMANDS="true"' before running breeze. + + + #################################################################################################### + + + Detailed usage for command: push-image + + + breeze push_image [FLAGS] + + Pushes images to docker registry. You can push the images to DockerHub registry (default) + or to the GitHub registry (if --github-registry flag is used). + + For DockerHub pushes --dockerhub-user and --dockerhub-repo flags can be used to specify + the repository to push to. 
For GitHub repository, the --github-repository + flag can be used for the same purpose. You can also add + --github-image-id | in case you want to push image with specific + SHA tag or run id. In case you specify --github-repository or --github-image-id, you + do not need to specify --github-registry flag. + + You can also add --production-image flag to switch to production image (default is CI one) + + Examples: + + 'breeze push-image' or + 'breeze push-image --dockerhub-user user' to push to your private registry or + 'breeze push-image --production-image' - to push production image or + 'breeze push-image --github-registry' - to push to GitHub image registry or + 'breeze push-image \ + --github-repository user/airflow' - to push to your user's fork + 'breeze push-image \ + --github-image-id 9a621eaa394c0a0a336f8e1b31b35eff4e4ee86e' - to push with COMMIT_SHA + 'breeze push-image \ + --github-image-id 209845560' - to push with RUN_ID + + Flags: + + -D, --dockerhub-user DOCKERHUB_USER + DockerHub user used to pull, push and build images. Default: apache. + + -H, --dockerhub-repo DOCKERHUB_REPO + DockerHub repository used to pull, push, build images. Default: airflow. + + -c, --github-registry GITHUB_REGISTRY + If GitHub registry is enabled, pulls and pushes are done from the GitHub registry not + DockerHub. You need to be logged in to the registry in order to be able to pull/push from + and you need to be committer to push to Apache Airflow' GitHub registry. + + -g, --github-repository GITHUB_REPOSITORY + GitHub repository used to pull, push images when cache is used. + Default: apache/airflow. + + If you use this flag, automatically --github-registry flag is enabled. + + -s, --github-image-id COMMIT_SHA|RUN_ID + or of the image. Images in GitHub registry are stored with those + to be able to easily find the image for particular CI runs. Once you know the + or , you can specify it in github-image-id flag and Breeze will + automatically pull and use that image so that you can easily reproduce a problem + that occurred in CI. + + If you use this flag, automatically --github-registry is enabled. + + + Default: latest. + + -v, --verbose + Show verbose information about executed docker, kind, kubectl, helm commands. Useful for + debugging - when you run breeze with --verbose flags you will be able to see the commands + executed under the hood and copy&paste them to your terminal to debug them more easily. + + Note that you can further increase verbosity and see all the commands executed by breeze + by running 'export VERBOSE_COMMANDS="true"' before running breeze. + + + #################################################################################################### + + + Detailed usage for command: initialize-local-virtualenv + + + breeze initialize-local-virtualenv [FLAGS] + + Initializes locally created virtualenv installing all dependencies of Airflow + taking into account the constraints for the version specified. + This local virtualenv can be used to aid auto-completion and IDE support as + well as run unit tests directly from the IDE. You need to have virtualenv + activated before running this command. + + Flags: + + -p, --python PYTHON_MAJOR_MINOR_VERSION + Python version used for the image. This is always major/minor version. 
+ + One of: + + 2.7 3.5 3.6 3.7 3.8 + + + #################################################################################################### + + + Detailed usage for command: prepare-airflow-packages + + + breeze prepare-airflow-packages [FLAGS] + + Prepares airflow packages (sdist and wheel) in dist folder. Note that + prepare-provider-packages command cleans up the dist folder, so if you want also + to generate provider packages, make sure you run prepare-provider-packages first, + and prepare-airflow-packages second. + + General form: + + 'breeze prepare-airflow-packages + + Flags: + + --package-format PACKAGE_FORMAT + + Chooses format of packages to prepare. + + One of: + + wheel,sdist,both + + Default:  + + -S, --version-suffix-for-pypi SUFFIX + Adds optional suffix to the version in the generated backport package. It can be used + to generate rc1/rc2 ... versions of the packages to be uploaded to PyPI. + + -N, --version-suffix-for-svn SUFFIX + Adds optional suffix to the generated names of package. It can be used to generate + rc1/rc2 ... versions of the packages to be uploaded to SVN. + + -v, --verbose + Show verbose information about executed docker, kind, kubectl, helm commands. Useful for + debugging - when you run breeze with --verbose flags you will be able to see the commands + executed under the hood and copy&paste them to your terminal to debug them more easily. + + Note that you can further increase verbosity and see all the commands executed by breeze + by running 'export VERBOSE_COMMANDS="true"' before running breeze. + + + #################################################################################################### + + + Detailed usage for command: setup-autocomplete + + + breeze setup-autocomplete + + Sets up autocomplete for breeze commands. Once you do it you need to re-enter the bash + shell and when typing breeze command will provide autocomplete for + parameters and values. + + + #################################################################################################### + + + Detailed usage for command: start-airflow + + + breeze start-airflow + + Like the Shell command this will enter the interactive shell, but it will also start + automatically the Scheduler and the Webserver. It will leave you in a tmux session where you + can also observe what is happening in your Airflow. + + This is a convenient way to setup a development environment. Your dags will be loaded from the + folder 'files/dags' on your host machine (it could take some times). + + If you want to load default connections and example dags you can use the dedicated flags. + + Flags: + + --load-example-dags + Include Airflow example dags. + + --load-default-connections + Include Airflow Default Connections. + + + #################################################################################################### + + + Detailed usage for command: stop + + + breeze stop + + Brings down running docker compose environment. When you start the environment, the docker + containers will continue running so that startup time is shorter. But they take quite a lot of + memory and CPU. This command stops all running containers from the environment. + + Flags: + + --preserve-volumes + Use this flag if you would like to preserve data volumes from the databases used + by the integrations. By default, those volumes are deleted, so when you run 'stop' + or 'restart' commands you start from scratch, but by using this flag you can + preserve them. 
If you want to delete those volumes after stopping Breeze, just + run the 'breeze stop' again without this flag. + + + #################################################################################################### + + + Detailed usage for command: restart + + + breeze restart [FLAGS] + + Restarts running docker compose environment. When you restart the environment, the docker + containers will be restarted. That includes cleaning up the databases. This is + especially useful if you switch between different versions of Airflow. + + Flags: + + --preserve-volumes + Use this flag if you would like to preserve data volumes from the databases used + by the integrations. By default, those volumes are deleted, so when you run 'stop' + or 'restart' commands you start from scratch, but by using this flag you can + preserve them. If you want to delete those volumes after stopping Breeze, just + run the 'breeze stop' again without this flag. + + + #################################################################################################### + + + Detailed usage for command: toggle-suppress-cheatsheet + + + breeze toggle-suppress-cheatsheet + + Toggles on/off cheatsheet displayed before starting bash shell. + + + #################################################################################################### + + + Detailed usage for command: toggle-suppress-asciiart + + + breeze toggle-suppress-asciiart + + Toggles on/off asciiart displayed before starting bash shell. + + + #################################################################################################### + + + Detailed usage for command: docker-compose + + + breeze docker-compose [FLAGS] COMMAND [-- ] + + Run docker-compose command instead of entering the environment. Use 'help' as command + to see available commands. The passed after -- are treated + as additional options passed to docker-compose. For example + + 'breeze docker-compose pull -- --ignore-pull-failures' + + Flags: + + -p, --python PYTHON_MAJOR_MINOR_VERSION + Python version used for the image. This is always major/minor version. + + One of: + + 2.7 3.5 3.6 3.7 3.8 + + -b, --backend BACKEND + Backend to use for tests - it determines which database is used. + One of: + + sqlite mysql postgres + + Default: sqlite + + --postgres-version POSTGRES_VERSION + Postgres version used. One of: + + 9.6 10 11 12 13 + + --mysql-version MYSQL_VERSION + Mysql version used. One of: + + 5.6 5.7 + + -v, --verbose + Show verbose information about executed docker, kind, kubectl, helm commands. Useful for + debugging - when you run breeze with --verbose flags you will be able to see the commands + executed under the hood and copy&paste them to your terminal to debug them more easily. + + Note that you can further increase verbosity and see all the commands executed by breeze + by running 'export VERBOSE_COMMANDS="true"' before running breeze. + + + #################################################################################################### + + + Detailed usage for command: kind-cluster + + + breeze kind-cluster [FLAGS] OPERATION + + Manages host-side Kind Kubernetes cluster that is used to run Kubernetes integration tests. + It allows to start/stop/restart/status the Kind Kubernetes cluster and deploy Airflow to it. + This enables you to run tests inside the breeze environment with latest airflow images. 
+ Note that in case of deploying airflow, the first step is to rebuild the image and loading it + to the cluster so you can also pass appropriate build image flags that will influence + rebuilding the production image. Operation is one of: + + start stop restart status deploy test shell k9s + + The last two operations - shell and k9s allow you to perform interactive testing with + kubernetes tests. You can enter the shell from which you can run kubernetes tests and in + another terminal you can start the k9s CLI to debug kubernetes instance. It is an easy + way to debug the kubernetes deployments. + + You can read more about k9s at https://k9scli.io/ + + Flags: + + -p, --python PYTHON_MAJOR_MINOR_VERSION + Python version used for the image. This is always major/minor version. + + One of: + + 2.7 3.5 3.6 3.7 3.8 + + -F, --force-build-images + Forces building of the local docker images. The images are rebuilt + automatically for the first time or when changes are detected in + package-related files, but you can force it using this flag. + + -P, --force-pull-images + Forces pulling of images from DockerHub before building to populate cache. The + images are pulled by default only for the first time you run the + environment, later the locally build images are used as cache. + + Customization options: + + -E, --extras EXTRAS + Extras to pass to build images The default are different for CI and production images: -This is the current syntax for `./breeze <./breeze>`_: + CI image: + devel_ci - .. START BREEZE HELP MARKER + Production image: + async,aws,azure,celery,dask,elasticsearch,gcp,kubernetes,mysql,postgres,redis,slack, + ssh,statsd,virtualenv -.. code-block:: text + --image-tag TAG + Additional tag in the image. + --disable-pypi-when-building + Disable installing Airflow from pypi when building. If you use this flag and want + to install Airflow, you have to install it from packages placed in + 'docker-context-files' and use --install-from-local-files-when-building flag. - #################################################################################################### + --additional-extras ADDITIONAL_EXTRAS + Additional extras to pass to build images The default is no additional extras. - Usage: breeze [FLAGS] [COMMAND] -- + --additional-python-deps ADDITIONAL_PYTHON_DEPS + Additional python dependencies to use when building the images. - By default the script enters IT environment and drops you to bash shell, but you can choose one - of the commands to run specific actions instead. Add --help after each command to see details: + --dev-apt-command DEV_APT_COMMAND + The basic command executed before dev apt deps are installed. - Commands without arguments: + --additional-dev-apt-command ADDITIONAL_DEV_APT_COMMAND + Additional command executed before dev apt deps are installed. 
- shell [Default] Enters interactive shell in the container - build-docs Builds documentation in the container - build-image Builds CI or Production docker image - cleanup-image Cleans up the container image created - exec Execs into running breeze container in new terminal - generate-requirements Generates pinned requirements for pip dependencies - initialize-local-virtualenv Initializes local virtualenv - setup-autocomplete Sets up autocomplete for breeze - stop Stops the docker-compose evironment - restart Stops the docker-compose evironment including DB cleanup - toggle-suppress-cheatsheet Toggles on/off cheatsheet - toggle-suppress-asciiart Toggles on/off asciiart + --additional-dev-apt-deps ADDITIONAL_DEV_APT_DEPS + Additional apt dev dependencies to use when building the images. - Commands with arguments: + --dev-apt-deps DEV_APT_DEPS + The basic apt dev dependencies to use when building the images. - docker-compose Executes specified docker-compose command - execute-command Executes specified command in the container - static-check Performs selected static check for changed files - static-check-all-files Performs selected static check for all files - test-target Runs selected test target in the container + --additional-dev-apt-deps ADDITIONAL_DEV_DEPS + Additional apt dev dependencies to use when building the images. - Help commands: + --additional-dev-apt-envs ADDITIONAL_DEV_APT_ENVS + Additional environment variables set when adding dev dependencies. - flags Shows all breeze's flags - help Shows this help message - help-all Shows detailed help for all commands and flags + --runtime-apt-command RUNTIME_APT_COMMAND + The basic command executed before runtime apt deps are installed. - #################################################################################################### + --additional-runtime-apt-command ADDITIONAL_RUNTIME_APT_COMMAND + Additional command executed before runtime apt deps are installed. - Detailed usage + --runtime-apt-deps ADDITIONAL_RUNTIME_APT_DEPS + The basic apt runtime dependencies to use when building the images. - #################################################################################################### + --additional-runtime-apt-deps ADDITIONAL_RUNTIME_DEPS + Additional apt runtime dependencies to use when building the images. - breeze [FLAGS] shell -- + --additional-runtime-apt-envs ADDITIONAL_RUNTIME_APT_DEPS + Additional environment variables set when adding runtime dependencies. - This is default subcommand if no subcommand is used. + Build options: - Enters interactive shell where you can run all tests, start airflow webserver, scheduler, - workers, interact with the database, run DAGs etc. It is the default command if no command - is selected. The shell is executed in the container and in case integrations are chosen, - the integrations will be started as separated docker containers - under the docker-compose - supervision. Local sources are by default mounted to within the container so you can edit - them locally and run tests immediately in the container. Several folders ('files', 'dist') - are also mounted so that you can exchange files between the host and container. + --disable-mysql-client-installation + Disables installation of the mysql client which might be problematic if you are building + image in controlled environment. Only valid for production image. - The 'files/airflow-breeze-config/variables.env' file can contain additional variables - and setup. This file is automatically sourced when you enter the container. 
Database - and webserver ports are forwarded to appropriate database/webserver so that you can - connect to it from your host environment. - **************************************************************************************************** - breeze [FLAGS] build-docs -- + --constraints-location + Url to the constraints file. In case of the production image it can also be a path to the + constraint file placed in 'docker-context-files' folder, in which case it has to be + in the form of '/docker-context-files/' - Builds airflow documentation. The documentation is build inside docker container - to - maintain the same build environment for everyone. Appropriate sources are mapped from - the host to the container so that latest sources are used. The folders where documentation - is generated ('docs/build') are also mounted to the container - this way results of - the documentation build is available in the host. - **************************************************************************************************** - breeze [FLAGS] build-image -- + --disable-pip-cache + Disables GitHub PIP cache during the build. Useful if GitHub is not reachable during build. - Builds docker image (CI or production) without entering the container. You can pass - aditional options to this command, such as '--force-build-image', - '--force-pull-image' '--python' '--use-local-cache'' in order to modify build behaviour. - You can also pass '--production-image' flag to build production image rather than CI image. - **************************************************************************************************** - breeze [FLAGS] cleanup-image -- + --install-from-local-files-when-building + This flag is used during image building. If it is used additionally to installing + Airflow from PyPI, the packages are installed from the .whl and .tar.gz packages placed + in the 'docker-context-files' folder. The same flag can be used during entering the image in + the CI image - in this case also the .whl and .tar.gz files will be installed automatically - Removes the breeze-related images created in your local docker image cache. This will - not reclaim space in docker cache. You need to 'docker system prune' (optionally - with --all) to reclaim that space. - **************************************************************************************************** - breeze [FLAGS] exec -- + -C, --force-clean-images + Force build images with cache disabled. This will remove the pulled or build images + and start building images from scratch. This might take a long time. - Execs into interactive shell to an already running container. The container mus be started - already by breeze shell command. If you are not familiar with tmux, this is the best - way to run multiple processes in the same container at the same time for example scheduler, - webserver, workers, database console and interactive terminal. - **************************************************************************************************** - breeze [FLAGS] generate-requirements -- + -r, --skip-rebuild-check + Skips checking image for rebuilds. It will use whatever image is available locally/pulled. - Generates pinned requirements from setup.py. Those requirements are generated in requirements - directory - separately for different python version. Those requirements are used to run - CI builds as well as run repeatable production image builds. You can use those requirements - to predictably install released airflow versions. 
You should run it always after you update - setup.py. - **************************************************************************************************** - breeze [FLAGS] initialize-local-virtualenv -- + -L, --build-cache-local + Uses local cache to build images. No pulled images will be used, but results of local + builds in the Docker cache are used instead. This will take longer than when the pulled + cache is used for the first time, but subsequent '--build-cache-local' builds will be + faster as they will use mostly the locally build cache. - Initializes locally created virtualenv installing all dependencies of Airflow - taking into account the frozen requirements from requirements folder. - This local virtualenv can be used to aid autocompletion and IDE support as - well as run unit tests directly from the IDE. You need to have virtualenv - activated before running this command. - **************************************************************************************************** - breeze [FLAGS] setup-autocomplete -- + This is default strategy used by the Production image builds. - Sets up autocomplete for breeze commands. Once you do it you need to re-enter the bash - shell and when typing breeze command will provide autocomplete for - parameters and values. - **************************************************************************************************** - breeze [FLAGS] stop -- + -U, --build-cache-pulled + Uses images pulled from registry (either DockerHub or GitHub depending on + --github-registry flag) to build images. The pulled images will be used as cache. + Those builds are usually faster than when ''--build-cache-local'' with the exception if + the registry images are not yet updated. The DockerHub images are updated nightly and the + GitHub images are updated after merges to master so it might be that the images are still + outdated vs. the latest version of the Dockerfiles you are using. In this case, the + ''--build-cache-local'' might be faster, especially if you iterate and change the + Dockerfiles yourself. - Brings down running docker compose environment. When you start the environment, the docker - containers will continue running so that startup time is shorter. But they take quite a lot of - memory and CPU. This command stops all running containers from the environment. - **************************************************************************************************** - breeze [FLAGS] restart -- + This is default strategy used by the CI image builds. - Restarts running docker compose environment. When you restart the environment, the docker - containers will be restarted. That includes cleaning up the databases. This is - especially useful if you switch between different versions of airflow. - **************************************************************************************************** - breeze [FLAGS] toggle-suppress-cheatsheet -- + -X, --build-cache-disabled + Disables cache during docker builds. This is useful if you want to make sure you want to + rebuild everything from scratch. - Toggles on/off cheatsheet displayed before starting bash shell. - **************************************************************************************************** - breeze [FLAGS] toggle-suppress-asciiart -- + This strategy is used by default for both Production and CI images for the scheduled + (nightly) builds in CI. - Toggles on/off asciiart displayed before starting bash shell. 
- **************************************************************************************************** - breeze [FLAGS] docker-compose -- - Run docker-compose command instead of entering the environment. Use 'help' as command - to see available commands. The passed after -- are treated - as additional options passed to docker-compose. For example + #################################################################################################### - 'breeze docker-compose pull -- --ignore-pull-failures' - **************************************************************************************************** - breeze [FLAGS] execute-command -- - Run chosen command instead of entering the environment. The command is run using - 'bash -c "" if you need to pass arguments to your command, you need - to pass them together with command surrounded with " or '. Alternatively you can - pass arguments as passed after --. For example: + Detailed usage for command: static-check - 'breeze execute-command "ls -la"' or - 'breeze execute-command ls -- --la' - **************************************************************************************************** - breeze [FLAGS] static-check -- + + breeze static-check [FLAGS] static_check [-- ] Run selected static checks for currently changed files. You should specify static check that you would like to run or 'all' to run all checks. One of: - all bat-tests check-apache-license check-executables-have-shebangs check-hooks-apply - check-merge-conflict check-xml debug-statements doctoc detect-private-key - end-of-file-fixer flake8 forbid-tabs insert-license lint-dockerfile - mixed-line-ending mypy setup-order shellcheck + all airflow-config-yaml base-operator bats-tests bats-in-container-tests build + check-apache-license check-builtin-literals check-executables-have-shebangs + check-hooks-apply check-integrations check-merge-conflict check-xml debug-statements + detect-private-key doctoc dont-use-safe-filter end-of-file-fixer fix-encoding-pragma + flake8 forbid-tabs helm-lint identity incorrect-use-of-LoggingMixin insert-license + language-matters lint-dockerfile lint-openapi markdownlint mermaid mixed-line-ending + mypy mypy-helm no-relative-imports pre-commit-descriptions pydevd python2-compile + python2-fastcheck python-no-log-warn rst-backticks setup-order setup-installation + shellcheck sort-in-the-wild trailing-whitespace update-breeze-file update-extras + update-local-yml-file yamllint You can pass extra arguments including options to to the pre-commit framework as passed after --. For example: 'breeze static-check mypy' or 'breeze static-check mypy -- --files tests/core.py' + 'breeze static-check mypy -- --all-files' + + To check all files that differ between you current branch and master run: + + 'breeze static-check all -- --from-ref $(git merge-base master HEAD) --to-ref HEAD' You can see all the options by adding --help EXTRA_ARG: 'breeze static-check mypy -- --help' - **************************************************************************************************** - breeze [FLAGS] static-check-all-files -- - Run selected static checks for all applicable files. You should specify static check that - you would like to run or 'all' to run all checks. 
One of: - all bat-tests check-apache-license check-executables-have-shebangs check-hooks-apply - check-merge-conflict check-xml debug-statements doctoc detect-private-key - end-of-file-fixer flake8 forbid-tabs insert-license lint-dockerfile - mixed-line-ending mypy setup-order shellcheck + #################################################################################################### - You can pass extra arguments including options to the pre-commit framework as - passed after --. For example: - 'breeze static-check-all-files mypy' or - 'breeze static-check-all-files mypy -- --verbose' + Detailed usage for command: tests - You can see all the options by adding --help EXTRA_ARG: - 'breeze static-check-all-files mypy -- --help' - **************************************************************************************************** - breeze [FLAGS] test-target -- + breeze tests [FLAGS] [TEST_TARGET ..] [-- ] Run the specified unit test target. There might be multiple targets specified separated with comas. The passed after -- are treated - as additional options passed to pytest. For example: + as additional options passed to pytest. You can pass 'tests' as target to + run all tests. For example: + + 'breeze tests tests/core/test_core.py -- --logging-level=DEBUG' + 'breeze tests tests + + Flags: + + --test-type TEST_TYPE + Type of the test to run. One of: + + All,Core,Integration,Heisentests,Postgres,MySQL,Helm + + Default: All + + + #################################################################################################### + + + Detailed usage for command: flags - 'breeze test-target tests/test_core.py -- --logging-level=DEBUG' - **************************************************************************************************** - breeze [FLAGS] flags -- Explains in detail all the flags that can be used with breeze. - **************************************************************************************************** - breeze [FLAGS] help -- - Shows this help message. - **************************************************************************************************** - breeze [FLAGS] help-all -- - Shows detailed help for all commands and flags. - **************************************************************************************************** #################################################################################################### - Flags + + Detailed usage for command: help + + + breeze help + + Shows general help message for all commands. + #################################################################################################### - **************************************************************************************************** - List of flags supported by breeze: + Detailed usage for command: help-all + + + breeze help-all + + Shows detailed help for all commands and flags. + + + #################################################################################################### + + + #################################################################################################### + + Summary of all flags supported by Breeze: **************************************************************************************************** Choose Airflow variant - **************************************************************************************************** - -p, --python + -p, --python PYTHON_MAJOR_MINOR_VERSION Python version used for the image. This is always major/minor version. 
+ One of: - 2.7 3.5 3.6 3.7 + 2.7 3.5 3.6 3.7 3.8 + + **************************************************************************************************** + Choose backend to run for Airflow - -b, --backend + -b, --backend BACKEND Backend to use for tests - it determines which database is used. One of: @@ -842,107 +2105,109 @@ This is the current syntax for `./breeze <./breeze>`_: Default: sqlite + --postgres-version POSTGRES_VERSION + Postgres version used. One of: + + 9.6 10 11 12 13 + + --mysql-version MYSQL_VERSION + Mysql version used. One of: + + 5.6 5.7 + + **************************************************************************************************** + Enable production image + + -I, --production-image + Use production image for entering the environment and builds (not for tests). + + **************************************************************************************************** + Additional actions executed while entering breeze + -d, --db-reset - Resets the database at entry to the envvironment. It will drop all the tables + Resets the database at entry to the environment. It will drop all the tables and data and recreate the DB from scratch even if 'restart' command was not used. Combined with 'restart' command it enters the environment in the state that is - ready to start airflow webserver/scheduler/worker. Without the switch, the database + ready to start Airflow webserver/scheduler/worker. Without the switch, the database does not have any tables and you need to run reset db manually. - -i, --integration + -i, --integration INTEGRATION Integration to start during tests - it determines which integrations are started for integration tests. There can be more than one integration started, or all to start all integrations. Selected integrations are not saved for future execution. One of: - cassandra kerberos mongo openldap rabbitmq redis all + cassandra kerberos mongo openldap presto rabbitmq redis all - -I, --production-image - Use production image for entering the environment and builds (not for tests). + --init-script INIT_SCRIPT_FILE + Initialization script name - Sourced from files/airflow-breeze-config. Default value + init.sh. It will be executed after the environment is configured and started. **************************************************************************************************** - Manage Kind kubernetes cluster (optional) - **************************************************************************************************** + Additional actions executed while starting Airflow + --load-example-dags + Include Airflow example dags. - Acion for the cluster : only one of the --kind-cluster-* flags can be used at a time: + --load-default-connections + Include Airflow Default Connections. - -s, --kind-cluster-start - Starts kind Kubernetes cluster after entering the environment. The cluster is started using - Kubernetes Mode selected and Kubernetes version specified via --kubernetes-mode and - --kubernetes-version flags. - - -x, --kind-cluster-stop - Stops kind Kubernetes cluster if one has already been created. By default, if you do not - stop environment, the Kubernetes cluster created for testing is continuously running and - when you start Kubernetes testing again it will be reused. You can force deletion and - recreation of such cluster with this flag. 
+ **************************************************************************************************** + Cleanup options when stopping Airflow - -r, --kind-cluster-recreate + --preserve-volumes + Use this flag if you would like to preserve data volumes from the databases used + by the integrations. By default, those volumes are deleted, so when you run 'stop' + or 'restart' commands you start from scratch, but by using this flag you can + preserve them. If you want to delete those volumes after stopping Breeze, just + run the 'breeze stop' again without this flag. - Recreates kind Kubernetes cluster if one has already been created. By default, if you do - not stop environment, the Kubernetes cluster created for testing is continuously running - and when you start Kubernetes testing again it will be reused. You can force deletion and - recreation of such cluster with this flag. + **************************************************************************************************** + Kind kubernetes and Kubernetes tests configuration(optional) - Kubernetes mode/version flags: + Configuration for the KinD Kubernetes cluster and tests: - -K, --kubernetes-mode - Kubernetes mode - only used in case one of --kind-cluster-* commands is used. + -K, --kubernetes-mode KUBERNETES_MODE + Kubernetes mode - only used in case one of kind-cluster commands is used. One of: - persistent_mode git_mode + image - Default: git_mode + Default: image - -V, --kubernetes-version - Kubernetes version - only used in case one of --kind-cluster-* commands is used. + -V, --kubernetes-version KUBERNETES_VERSION + Kubernetes version - only used in case one of kind-cluster commands is used. One of: - v1.15.3 v1.16.2 - - Default: v1.15.3 + v1.18.6 v1.17.5 v1.16.9 - **************************************************************************************************** - Manage mounting local files - **************************************************************************************************** + Default: v1.18.6 - -l, --skip-mounting-local-sources - Skips mounting local volume with sources - you get exactly what is in the - docker image rather than your current local sources of airflow. + --kind-version KIND_VERSION + Kind version - only used in case one of kind-cluster commands is used. + One of: - **************************************************************************************************** - Install Airflow if different than current - **************************************************************************************************** + v0.8.0 - -a, --install-airflow-version - If specified, installs airflow directly from PIP released version. One of: + Default: v0.8.0 - 1.10.9 1.10.8 1.10.7 1.10.6 1.10.5 1.10.4 1.10.3 1.10.2 master v1-10-test + --helm-version HELM_VERSION + Helm version - only used in case one of kind-cluster commands is used. + One of: - -t, --install-airflow-reference - Only for production image - if specified, installs airflow directly from reference in GitHub + v3.2.4 + Default: v3.2.4 **************************************************************************************************** - Database versions - **************************************************************************************************** - - --postgres-version - Postgres version used. One of: - - 9.6 10 - - - --mysql-version - Mysql version used. 
One of: - - 5.6 5.7 + Manage mounting local files + -l, --skip-mounting-local-sources + Skips mounting local volume with sources - you get exactly what is in the + docker image rather than your current local sources of Airflow. **************************************************************************************************** Assume answers to questions - **************************************************************************************************** -y, --assume-yes Assume 'yes' answer to all questions. @@ -954,25 +2219,41 @@ This is the current syntax for `./breeze <./breeze>`_: Assume 'quit' answer to all questions. **************************************************************************************************** - Credentials + Choose different Airflow version to install or run + + -a, --install-airflow-version INSTALL_AIRFLOW_VERSION + If specified, installs Airflow directly from PIP released version. This happens at + image building time in production image and at container entering time for CI image. One of: + + 1.10.15 1.10.14 1.10.12 1.10.11 1.10.10 1.10.9 none wheel sdist + + When 'none' is used, you can install airflow from local packages. When building image, + airflow package should be added to 'docker-context-files' and + --install-from-docker-context-files flag should be used. When running an image, airflow + package should be added to dist folder and --install-packages-from-dist flag should be used. + + -t, --install-airflow-reference INSTALL_AIRFLOW_REFERENCE + If specified, installs Airflow directly from reference in GitHub. This happens at + image building time in production image and at container entering time for CI image. + This can be a GitHub branch like master or v1-10-test, or a tag like 2.0.0a1. + + --no-rbac-ui + Disables RBAC UI when Airflow 1.10.* is installed. + + --install-packages-from-dist + If specified it will look for packages placed in dist folder and it will install the + packages after installing Airflow. This is useful for testing provider + packages. + **************************************************************************************************** + Credentials -f, --forward-credentials Forwards host credentials to docker container. Use with care as it will make your credentials available to everything you install in Docker. **************************************************************************************************** - Increase verbosity of the script - **************************************************************************************************** - - -v, --verbose - Show verbose information about executed commands (enabled by default for running test). - Note that you can further increase verbosity and see all the commands executed by breeze - by running 'export VERBOSE_COMMANDS="true"' before running breeze. - - **************************************************************************************************** - Flags for building the docker images - **************************************************************************************************** + Flags for building Docker images (both CI and production) -F, --force-build-images Forces building of the local docker images. The images are rebuilt @@ -984,7 +2265,9 @@ This is the current syntax for `./breeze <./breeze>`_: images are pulled by default only for the first time you run the environment, later the locally build images are used as cache. 
- -E, --extras + Customization options: + + -E, --extras EXTRAS Extras to pass to build images The default are different for CI and production images: CI image: @@ -994,83 +2277,175 @@ This is the current syntax for `./breeze <./breeze>`_: async,aws,azure,celery,dask,elasticsearch,gcp,kubernetes,mysql,postgres,redis,slack, ssh,statsd,virtualenv + --image-tag TAG + Additional tag in the image. + + --disable-pypi-when-building + Disable installing Airflow from pypi when building. If you use this flag and want + to install Airflow, you have to install it from packages placed in + 'docker-context-files' and use --install-from-local-files-when-building flag. + + --additional-extras ADDITIONAL_EXTRAS + Additional extras to pass to build images The default is no additional extras. + + --additional-python-deps ADDITIONAL_PYTHON_DEPS + Additional python dependencies to use when building the images. + + --dev-apt-command DEV_APT_COMMAND + The basic command executed before dev apt deps are installed. + + --additional-dev-apt-command ADDITIONAL_DEV_APT_COMMAND + Additional command executed before dev apt deps are installed. + + --additional-dev-apt-deps ADDITIONAL_DEV_APT_DEPS + Additional apt dev dependencies to use when building the images. + + --dev-apt-deps DEV_APT_DEPS + The basic apt dev dependencies to use when building the images. + + --additional-dev-apt-deps ADDITIONAL_DEV_DEPS + Additional apt dev dependencies to use when building the images. + + --additional-dev-apt-envs ADDITIONAL_DEV_APT_ENVS + Additional environment variables set when adding dev dependencies. + + --runtime-apt-command RUNTIME_APT_COMMAND + The basic command executed before runtime apt deps are installed. + + --additional-runtime-apt-command ADDITIONAL_RUNTIME_APT_COMMAND + Additional command executed before runtime apt deps are installed. + + --runtime-apt-deps ADDITIONAL_RUNTIME_APT_DEPS + The basic apt runtime dependencies to use when building the images. + + --additional-runtime-apt-deps ADDITIONAL_RUNTIME_DEPS + Additional apt runtime dependencies to use when building the images. + + --additional-runtime-apt-envs ADDITIONAL_RUNTIME_APT_DEPS + Additional environment variables set when adding runtime dependencies. + + Build options: + + --disable-mysql-client-installation + Disables installation of the mysql client which might be problematic if you are building + image in controlled environment. Only valid for production image. + + --constraints-location + Url to the constraints file. In case of the production image it can also be a path to the + constraint file placed in 'docker-context-files' folder, in which case it has to be + in the form of '/docker-context-files/' + + --disable-pip-cache + Disables GitHub PIP cache during the build. Useful if GitHub is not reachable during build. + + --install-from-local-files-when-building + This flag is used during image building. If it is used additionally to installing + Airflow from PyPI, the packages are installed from the .whl and .tar.gz packages placed + in the 'docker-context-files' folder. The same flag can be used during entering the image in + the CI image - in this case also the .whl and .tar.gz files will be installed automatically + -C, --force-clean-images Force build images with cache disabled. This will remove the pulled or build images and start building images from scratch. This might take a long time. - -L, --use-local-cache + -r, --skip-rebuild-check + Skips checking image for rebuilds. It will use whatever image is available locally/pulled. 
+ + -L, --build-cache-local Uses local cache to build images. No pulled images will be used, but results of local - builds in the Docker cache are used instead. + builds in the Docker cache are used instead. This will take longer than when the pulled + cache is used for the first time, but subsequent '--build-cache-local' builds will be + faster as they will use mostly the locally build cache. - **************************************************************************************************** - Flags for pushing the docker images - **************************************************************************************************** + This is default strategy used by the Production image builds. - -u, --push-images - After building - uploads the images to DockerHub - It is useful in case you use your own DockerHub user to store images and you want - to build them locally. Note that you need to use 'docker login' before you upload images. + -U, --build-cache-pulled + Uses images pulled from registry (either DockerHub or GitHub depending on + --github-registry flag) to build images. The pulled images will be used as cache. + Those builds are usually faster than when ''--build-cache-local'' with the exception if + the registry images are not yet updated. The DockerHub images are updated nightly and the + GitHub images are updated after merges to master so it might be that the images are still + outdated vs. the latest version of the Dockerfiles you are using. In this case, the + ''--build-cache-local'' might be faster, especially if you iterate and change the + Dockerfiles yourself. + + This is default strategy used by the CI image builds. + + -X, --build-cache-disabled + Disables cache during docker builds. This is useful if you want to make sure you want to + rebuild everything from scratch. + + This strategy is used by default for both Production and CI images for the scheduled + (nightly) builds in CI. **************************************************************************************************** - User and repo used to login to github registry - **************************************************************************************************** + Flags for pulling/pushing Docker images (both CI and production) - -D, --dockerhub-user + -D, --dockerhub-user DOCKERHUB_USER DockerHub user used to pull, push and build images. Default: apache. - -H, --dockerhub-repo + -H, --dockerhub-repo DOCKERHUB_REPO DockerHub repository used to pull, push, build images. Default: airflow. - **************************************************************************************************** + -c, --github-registry GITHUB_REGISTRY + If GitHub registry is enabled, pulls and pushes are done from the GitHub registry not + DockerHub. You need to be logged in to the registry in order to be able to pull/push from + and you need to be committer to push to Apache Airflow' GitHub registry. - .. END BREEZE HELP MARKER + -g, --github-repository GITHUB_REPOSITORY + GitHub repository used to pull, push images when cache is used. + Default: apache/airflow. -Convenience Scripts -------------------- + If you use this flag, automatically --github-registry flag is enabled. -Once you run ``./breeze`` you can also execute various actions via generated convenience scripts: + -s, --github-image-id COMMIT_SHA|RUN_ID + or of the image. Images in GitHub registry are stored with those + to be able to easily find the image for particular CI runs. 
Once you know the + or , you can specify it in github-image-id flag and Breeze will + automatically pull and use that image so that you can easily reproduce a problem + that occurred in CI. -.. code-block:: + If you use this flag, automatically --github-registry is enabled. - Enter the environment : ./.build/cmd_run - Run command in the environment : ./.build/cmd_run "[command with args]" [bash options] - Run tests in the environment : ./.build/test_run [test-target] [pytest options] - Run Docker compose command : ./.build/dc [help/pull/...] [docker-compose options] -Troubleshooting -=============== + Default: latest. -If you are having problems with the Breeze environment, try the steps below. After each step you -can check whether your problem is fixed. + **************************************************************************************************** + Flags for running tests -1. If you are on macOS, check if you have enough disk space for Docker. -2. Restart Breeze with ``./breeze restart``. -3. Delete the ``.build`` directory and run ``./breeze build-image --force-pull-images``. -4. Clean up Docker images via ``breeze cleanup-image`` command. -5. Restart your Docker Engine and try again. -6. Restart your machine and try again. -7. Re-install Docker CE and try again. + --test-type TEST_TYPE + Type of the test to run. One of: -In case the problems are not solved, you can set the VERBOSE_COMMANDS variable to "true": + All,Core,Integration,Heisentests,Postgres,MySQL,Helm -.. code-block:: + Default: All - export VERBOSE_COMMANDS="true" + **************************************************************************************************** + Flags for generation of the packages + -S, --version-suffix-for-pypi SUFFIX + Adds optional suffix to the version in the generated backport package. It can be used + to generate rc1/rc2 ... versions of the packages to be uploaded to PyPI. -Then run the failed command, copy-and-paste the output from your terminal to the -`Airflow Slack `_ #airflow-breeze channel and -describe your problem. + -N, --version-suffix-for-svn SUFFIX + Adds optional suffix to the generated names of package. It can be used to generate + rc1/rc2 ... versions of the packages to be uploaded to SVN. -Fixing File/Directory Ownership -------------------------------- + **************************************************************************************************** + Increase verbosity of the scripts -On Linux there is a problem with propagating ownership of created files (a known Docker problem). Basically, -files and directories created in the container are not owned by the host user (but by the root user in our -case). This may prevent you from switching branches, for example, if files owned by the root user are -created within your sources. In case you are on a Linux host and have some files in your sources created -y the root user, you can fix the ownership of those files by running this script: + -v, --verbose + Show verbose information about executed docker, kind, kubectl, helm commands. Useful for + debugging - when you run breeze with --verbose flags you will be able to see the commands + executed under the hood and copy&paste them to your terminal to debug them more easily. -.. code-block:: + Note that you can further increase verbosity and see all the commands executed by breeze + by running 'export VERBOSE_COMMANDS="true"' before running breeze. 
- ./scripts/ci/ci_fix_ownership.sh + **************************************************************************************************** + Print detailed help message + + -h, --help + Shows detailed help message for the command specified. + + .. END BREEZE HELP MARKER diff --git a/CHANGELOG.txt b/CHANGELOG.txt index a8aa3533bafa4..50817d0373264 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,469 @@ +Airflow 1.10.15, 2021-03-16 +---------------------------- + +Bug Fixes +""""""""" + +- Fix ``airflow db upgrade`` to upgrade db as intended (#13267) +- Moved boto3 limitation to snowflake (#13286) +- ``KubernetesExecutor`` should accept images from ``executor_config`` (#13074) +- Scheduler should acknowledge active runs properly (#13803) +- Bugfix: Unable to import Airflow plugins on Python 3.8 (#12859) +- Include ``airflow/contrib/executors`` in the dist package +- Pin Click version for Python 2.7 users +- Ensure all statsd timers use millisecond values. (#10633) +- [``kubernetes_generate_dag_yaml``] - Fix dag yaml generate function (#13816) +- Fix `airflow tasks clear` cli command wirh `--yes` (#14188) +- Fix permission error on non-POSIX filesystem (#13121) (#14383) +- Fixed deprecation message for "variables" command (#14457) +- BugFix: fix the ``delete_dag`` function of json_client (#14441) +- Fix merging of secrets and configmaps for ``KubernetesExecutor`` (#14090) +- Fix webserver exiting when gunicorn master crashes (#13470) +- Bump ini from 1.3.5 to 1.3.8 in ``airflow/www_rbac`` +- Bump datatables.net from 1.10.21 to 1.10.23 in ``airflow/www_rbac`` +- Webserver: Sanitize string passed to origin param (#14738) +- Make ``rbac_app``'s ``db.session`` use the same timezone with ``@provide_session`` (#14025) + +Improvements +"""""""""""" + +- Adds airflow as viable docker command in official image (#12878) +- ``StreamLogWriter``: Provide (no-op) close method (#10885) +- Add 'airflow variables list' command for 1.10.x transition version (#14462) + +Doc only changes +"""""""""""""""" + +- Update URL for Airflow docs (#13561) +- Clarifies version args for installing 1.10 in Docker (#12875) + +Airflow 1.10.14, 2020-12-10 +---------------------------- + +Bug Fixes +""""""""" + +- BugFix: Tasks with ``depends_on_past`` or ``task_concurrency`` are stuck (#12663) +- Fix issue with empty Resources in executor_config (#12633) +- Fix: Deprecated config ``force_log_out_after`` was not used (#12661) +- Fix empty asctime field in JSON formatted logs (#10515) +- [AIRFLOW-2809] Fix security issue regarding Flask SECRET_KEY (#3651) +- [AIRFLOW-2884] Fix Flask SECRET_KEY security issue in www_rbac (#3729) +- [AIRFLOW-2886] Generate random Flask SECRET_KEY in default config (#3738) +- Add missing comma in setup.py (#12790) +- Bugfix: Unable to import Airflow plugins on Python 3.8 (#12859) +- Fix setup.py missing comma in ``setup_requires`` (#12880) +- Don't emit first_task_scheduling_delay metric for only-once dags (#12835) + +Improvements +"""""""""""" + +- Update setup.py to get non-conflicting set of dependencies (#12636) +- Rename ``[scheduler] max_threads`` to ``[scheduler] parsing_processes`` (#12605) +- Add metric for scheduling delay between first run task & expected start time (#9544) +- Add new-style 2.0 command names for Airflow 1.10.x (#12725) +- Add Kubernetes cleanup-pods CLI command for Helm Chart (#11802) +- Don't let webserver run with dangerous config (#12747) +- Replace pkg_resources with importlib.metadata to avoid VersionConflict errors (#12694) + +Doc only changes 
+"""""""""""""""" + +- Clarified information about supported Databases + + +Airflow 1.10.13, 2020-11-24 +---------------------------- + +New Features +"""""""""""" + +- Add "already checked" to failed pods in K8sPodOperator (#11368) +- Pass SQLAlchemy engine options to FAB based UI (#11395) +- [AIRFLOW-4438] Add Gzip compression to S3_hook (#8571) +- Add permission "extra_links" for Viewer role and above (#10719) +- Add generate_yaml command to easily test KubernetesExecutor before deploying pods (#10677) +- Add Secrets backend for Microsoft Azure Key Vault (#10898) + +Bug Fixes +""""""""" + +- SkipMixin: Handle empty branches (#11120) +- [AIRFLOW-5274] dag loading duration metric name too long (#5890) +- Handle no Dagrun in DagrunIdDep (#8389) (#11343) +- Fix Kubernetes Executor logs for long dag names (#10942) +- Add on_kill support for the KubernetesPodOperator (#10666) +- KubernetesPodOperator template fix (#10963) +- Fix displaying of add serialized_dag table migration +- Fix Start Date tooltip on DAGs page (#10637) +- URL encode execution date in the Last Run link (#10595) +- Fixes issue with affinity backcompat in Airflow 1.10 +- Fix KubernetesExecutor import in views.py +- Fix issues with Gantt View (#12419) +- Fix Entrypoint and _CMD config variables (#12411) +- Fix operator field update for SerializedBaseOperator (#10924) +- Limited cryptography to < 3.2 for python 2.7 +- Install cattr on Python 3.7 - Fix docs build on RTD (#12045) +- Limit version of marshmallow-sqlalchemy +- Pin `kubernetes` to a max version of 11.0.0 (#11974) +- Use snakebite-py3 for HDFS dependency for Python3 (#12340) +- Removes snakebite kerberos dependency (#10865) +- Fix failing dependencies for FAB and Celery (#10828) +- Fix pod_mutation_hook for 1.10.13 (#10850) +- Fix formatting of Host information +- Fix Logout Google Auth issue in Non-RBAC UI (#11890) +- Add missing imports to app.py (#10650) +- Show Generic Error for Charts & Query View in old UI (#12495) +- TimeSensor should respect the default_timezone config (#9699) +- TimeSensor should respect DAG timezone (#9882) +- Unify user session lifetime configuration (#11970) +- Handle outdated webserver session timeout gracefully. 
(#12332) + + +Improvements +"""""""""""" + +- Add XCom.deserialize_value to Airflow 1.10.13 (#12328) +- Mount airflow.cfg to pod_template_file (#12311) +- All k8s object must comply with JSON Schema (#12003) +- Validate airflow chart values.yaml & values.schema.json (#11990) +- Pod template file uses custom custom env variable (#11480) +- Bump attrs and cattrs dependencies (#11969) +- Bump attrs to > 20.0 (#11799) +- [AIRFLOW-3607] Only query DB once per DAG run for TriggerRuleDep (#4751) +- Rename task with duplicate task_id +- Manage Flask AppBuilder Tables using Alembic Migrations (#12352) +- ``airflow test`` only works for tasks in 1.10, not whole dags (#11191) +- Improve warning messaging for duplicate task_ids in a DAG (#11126) +- Pins moto to 1.3.14 (#10986) +- DbApiHook: Support kwargs in get_pandas_df (#9730) +- Make grace_period_seconds option on K8sPodOperator (#10727) +- Fix syntax error in Dockerfile 'maintainer' Label (#10899) +- The entrypoints in Docker Image should be owned by Airflow (#10853) +- Make dockerfiles Google Shell Guide Compliant (#10734) +- clean-logs script for Dockerfile: trim logs before sleep (#10685) +- When sending tasks to celery from a sub-process, reset signal handlers (#11278) +- SkipMixin: Add missing session.commit() and test (#10421) +- Webserver: Further Sanitize values passed to origin param (#12459) +- Security upgrade lodash from 4.17.19 to 4.17.20 (#11095) +- Log instead of raise an Error for unregistered OperatorLinks (#11959) +- Mask Password in Log table when using the CLI (#11468) +- [AIRFLOW-3607] Optimize dep checking when depends on past set and concurrency limit +- Execute job cancel HTTPRequest in Dataproc Hook (#10361) +- Use rst lexer to format airflow upgrade check output (#11259) +- Remove deprecation warning from contrib/kubernetes/pod.py +- adding body as templated field for CloudSqlImportOperator (#10510) +- Change log level for User's session to DEBUG (#12414) + +Deprecations +"""""""""""" + +- Deprecate importing Hooks from plugin-created module (#12133) +- Deprecate adding Operators and Sensors via plugins (#12069) + +Doc only changes +"""""""""""""""" + +- [Doc] Correct description for macro task_instance_key_str (#11062) +- Checks if all the libraries in setup.py are listed in installation.rst file (#12023) +- Revise "Project Focus" copy (#12011) +- Move Project focus and Principles higher in the README (#11973) +- Remove archived link from README.md (#11945) +- Update download url for Airflow Version (#11800) +- Add Project URLs for PyPI page (#11801) +- Move Backport Providers docs to our docsite (#11136) +- Refactor rebase copy (#11030) +- Add missing images for kubernetes executor docs (#11083) +- Fix identation in executor_config example (#10467) +- Enhanced the Kubernetes Executor doc (#10433) +- Refactor content to a markdown table (#10863) +- Rename "Beyond the Horizon" section and refactor content (#10802) +- Refactor official source section to use bullets (#10801) +- Add section for official source code (#10678) +- Add redbubble link to Airflow merchandise (#10359) +- README Doc: Link to Airflow directory in ASF Directory (#11137) +- Fix the default value for VaultBackend's config_path (#12518) + +Airflow 1.10.12, 2020-08-25 +---------------------------- + +New Features +"""""""""""" + +- Add DateTimeSensor (#9697) +- Add ClusterPolicyViolation support to airflow local settings (#10282) +- Get Airflow configs with sensitive data from Secret Backends (#9645) +- [AIRFLOW-4734] Upsert functionality for 
PostgresHook.insert_rows() (#8625) +- Allow defining custom XCom class (#8560) + +Bug Fixes +""""""""" + +- Add pre 1.10.11 Kubernetes Paths back with Deprecation Warning (#10067) +- Fixes PodMutationHook for backwards compatibility (#9903) +- Fix bug in executor_config when defining resources (#9935) +- Respect DAG Serialization setting when running sync_perm (#10321) +- Show correct duration on graph view for running task (#8311) (#8675) +- Fix regression in SQLThresholdCheckOperator (#9312) +- [AIRFLOW-6931] Fixed migrations to find all dependencies for MSSQL (#9891) +- Avoid sharing session with RenderedTaskInstanceFields write and delete (#9993) +- Fix clear future recursive when ExternalTaskMarker is used (#9515) +- Handle IntegrityError while creating TIs (#10136) +- Fix airflow-webserver startup errors when using Kerberos Auth (#10047) +- Fixes treatment of open slots in scheduler (#9316) (#9505) +- Fix KubernetesPodOperator reattachment (#10230) +- Fix more PodMutationHook issues for backwards compatibility (#10084) +- [AIRFLOW-5391] Do not re-run skipped tasks when they are cleared (#7276) +- Fix task_instance_mutation_hook (#9910) +- Fixes failing formatting of DAG file containing {} in docstring (#9779) +- Fix is_terminal_support_colors function (#9734) +- Fix PythonVirtualenvOperator when using ``provide_context=True`` (#8256) +- Fix issue with mounting volumes from secrets (#10366) +- BugFix: K8s Executor Multinamespace mode is evaluated to true by default (#10410) +- Make KubernetesExecutor recognize kubernetes_labels (#10412) +- Fix broken Kubernetes PodRuntimeInfoEnv (#10478) +- Sync FAB Permissions for all base views (#12162) + +Improvements +"""""""""""" + +- Use Hash of Serialized DAG to determine DAG is changed or not (#10227) +- Update Serialized DAGs in Webserver when DAGs are Updated (#9851) +- Do not Update Serialized DAGs in DB if DAG did not change (#9850) +- Add __repr__ to SerializedDagModel (#9862) +- Update JS packages to latest versions (#9811) (#9921) +- UI Graph View: Focus upstream / downstream task dependencies on mouseover (#9303) +- Allow ``image`` in ``KubernetesPodOperator`` to be templated (#10068) +- [AIRFLOW-6843] Add delete_option_kwargs to delete_namespaced_pod (#7523) +- Improve process terminating in scheduler_job (#8064) +- Replace deprecated base classes used in bigquery_check_operator (#10272) +- [AIRFLOW-5897] Allow setting -1 as pool slots value in webserver (#6550) +- Limit all google-cloud api to <2.0.0 (#10317) +- [AIRFLOW-6706] Lazy load operator extra links (#7327) (#10318) +- Add Snowflake support to SQL operator and sensor (#9843) +- Makes multi-namespace mode optional (#9570) +- Pin Pyarrow < 1.0 +- Pin pymongo version to <3.11.0 +- Pin google-cloud-container to <2 (#9901) +- Dockerfile: Remove package.json and yarn.lock from the prod image (#9814) +- Dockerfile: The group of embedded DAGs should be root to be OpenShift compatible (#9794) +- Update upper limit of flask-swagger, gunicorn & jinja2 (#9684) +- Webserver: Sanitize values passed to origin param (#10334) +- Sort connection type list in add/edit page alphabetically (#8692) + +Doc only changes +"""""""""""""""" + +- Add new committers: Ry Walker & Leah Cole to project.rst (#9892) +- Add Qingping Hou to committers list (#9725) +- Updated link to official documentation (#9629) +- Create a short-link for Airflow Slack Invites (#10034) +- Fix docstrings in BigQueryGetDataOperator (#10042) +- Set language on code-block on docs/howto/email-config.rst (#10238) +- Remove duplicate 
line from 1.10.10 CHANGELOG (#10289) +- Improve heading on Email Configuration page (#10175) +- Fix link for the Jinja Project in docs/tutorial.rst (#10245) +- Create separate section for Cron Presets (#10247) +- Add Syntax Highlights to code-blocks in docs/best-practices.rst (#10258) +- Fix docstrings in BigQueryGetDataOperator (#10042) +- Fix typo in Task Lifecycle section (#9867) +- Make Secret Backend docs clearer about Variable & Connection View (#8913) + +Airflow 1.10.11, 2020-07-10 +----------------------------- + +New Features +"""""""""""" + +- Add task instance mutation hook (#8852) +- Allow changing Task States Colors (#9520) +- Add support for AWS Secrets Manager as Secrets Backend (#8186) +- Add airflow info command to the CLI (#8704) +- Add Local Filesystem Secret Backend (#8596) +- Add Airflow config CLI command (#8694) +- Add Support for Python 3.8 (#8836)(#8823) +- Allow K8S worker pod to be configured from JSON/YAML file (#6230) +- Add quarterly to crontab presets (#6873) +- Add support for ephemeral storage on KubernetesPodOperator (#6337) +- Add AirflowFailException to fail without any retry (#7133) +- Add SQL Branch Operator (#8942) + +Bug Fixes +""""""""" + +- Use NULL as dag.description default value (#7593) +- BugFix: DAG trigger via UI error in RBAC UI (#8411) +- Fix logging issue when running tasks (#9363) +- Fix JSON encoding error in DockerOperator (#8287) +- Fix alembic crash due to typing import (#6547) +- Correctly restore upstream_task_ids when deserializing Operators (#8775) +- Correctly store non-default Nones in serialized tasks/dags (#8772) +- Correctly deserialize dagrun_timeout field on DAGs (#8735) +- Fix tree view if config contains " (#9250) +- Fix Dag Run UI execution date with timezone cannot be saved issue (#8902) +- Fix Migration for MSSQL (#8385) +- RBAC ui: Fix missing Y-axis labels with units in plots (#8252) +- RBAC ui: Fix missing task runs being rendered as circles instead (#8253) +- Fix: DagRuns page renders the state column with artifacts in old UI (#9612) +- Fix task and dag stats on home page (#8865) +- Fix the trigger_dag api in the case of nested subdags (#8081) +- UX Fix: Prevent undesired text selection with DAG title selection in Chrome (#8912) +- Fix connection add/edit for spark (#8685) +- Fix retries causing constraint violation on MySQL with DAG Serialization (#9336) +- [AIRFLOW-4472] Use json.dumps/loads for templating lineage data (#5253) +- Restrict google-cloud-texttospeach to committer (#7392) - [AIRFLOW-XXXX] Remove duplicated paragraph in docs (#7662) - Fix reference to KubernetesPodOperator (#8100) +- Update the tree view of dag on Concepts Last Run Only (#8268) Airflow 1.10.9, 2020-02-07 @@ -1515,7 +1981,7 @@ Improvements - [AIRFLOW-3034]: Readme updates : Add Slack & Twitter, remove Gitter - [AIRFLOW-3028] Update Text & Images in Readme.md - [AIRFLOW-208] Add badge to show supported Python versions (#3839) -- [AIRFLOW-2238] Update PR tool to push directly to Github +- [AIRFLOW-2238] Update PR tool to push directly to GitHub - [AIRFLOW-2238] Flake8 fixes on dev/airflow-pr - [AIRFLOW-2238] Update PR tool to remove outdated info (#3978) - [AIRFLOW-3005] Replace 'Airbnb Airflow' with 'Apache Airflow' (#3845) @@ -3591,7 +4057,7 @@ Airflow 1.7.1, 2016-05-19 - Update plugins.rst for clarity on the example (#1309) - Fix s3 logging issue - Add twitter feed example dag -- Github ISSUE_TEMPLATE & PR_TEMPLATE cleanup +- GitHub ISSUE_TEMPLATE & PR_TEMPLATE cleanup - Reduce logger verbosity - Adding a PR Template - Add Lucid to 
list of users
diff --git a/CI.rst b/CI.rst
new file mode 100644
index 0000000000000..0ac1c9d9f4ad4
--- /dev/null
+++ b/CI.rst
@@ -0,0 +1,786 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ .. http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+.. contents:: :local:
+
+CI Environment
+==============
+
+Continuous Integration is an important component of making Apache Airflow robust and stable. We are running
+a lot of tests for every pull request, for master and v1-10-test branches and regularly as CRON jobs.
+
+Our execution environment for CI is `GitHub Actions `_. GitHub Actions
+(GA) are very well integrated with GitHub code and workflows and have evolved fast in 2019/2020 to become
+a fully-fledged CI environment, easy to use and develop for, so we decided to switch to it. Our previous
+CI system was Travis CI.
+
+However, part of the philosophy we have is that we are not tightly coupled with any of the CI
+environments we use. Most of our CI jobs are written as bash scripts which are executed as steps in
+the CI jobs. We also have a number of variables that determine build behaviour.
+
+
+
+
+GitHub Actions runs
+-------------------
+
+Our builds on CI are highly optimized. They utilise some of the latest features provided by the GitHub Actions
+environment that make it possible to reuse parts of the build process across different jobs.
+
+A big part of our CI runs uses container images. Airflow has a lot of dependencies and in order to make
+sure that we are running tests in a well-configured and repeatable environment, most of the tests,
+documentation building, and some more sophisticated static checks are run inside a docker container
+environment. This environment consists of two types of images: CI images and PROD images. CI images
+are used for most of the tests and checks, whereas PROD images are used in the Kubernetes tests.
+
+In order to run the tests, we need to make sure that the images are built using the latest sources and that
+this is done quickly (a full rebuild of such an image from scratch might take ~15 minutes). Therefore optimisation
+techniques have been implemented that efficiently use the cache from the GitHub Docker registry - in most cases
+this brings down the time needed to rebuild the image to ~4 minutes. In some cases (when dependencies change)
+it can be ~6-7 minutes, and in case the base Python image releases a new patch level, it can be ~12 minutes.
+
+Currently, in the master version of Airflow, we run tests with 3 different versions of Python (3.6, 3.7, 3.8),
+which means that we have to build 6 images (3 CI ones and 3 PROD ones). Yet we run around 12 jobs
+with each of the CI images. That is a lot of time just to build the environment the tests run in. Therefore
+we are utilising the ``workflow_run`` feature of GitHub Actions.
+This feature allows us to run a separate, independent workflow when the main workflow is run. This separate
+workflow is different from the main one because by default it runs using the ``master`` version of the
+sources, but also - and most of all - because it has WRITE access to the repository. This is especially
+important in our case where Pull Requests to Airflow might come from any repository, and it would be a huge
+security issue if anyone from outside could utilise the WRITE access to the Apache Airflow repository via an
+external Pull Request.
+
+Thanks to the WRITE access and the fact that 'workflow_run' by default uses the 'master' version of the
+sources, we can safely run logic there that will check out the incoming Pull Request, build the container
+image from the sources of the incoming PR and push such an image to the GitHub Docker Registry - so that
+this image can be built only once and used by all the jobs running tests. The image is tagged with the unique
+``RUN_ID`` of the incoming Pull Request, and the tests run in the Pull Request can simply pull such an image
+rather than build it from scratch. Pulling such an image takes ~1 minute, so we save
+a lot of precious time for the jobs.
+
+
+Local runs
+----------
+
+The main goal of our CI philosophy is that, no matter how complex the test and integration
+infrastructure is, as a developer you should be able to reproduce and re-run any of the failed checks
+locally. One part of it is the pre-commit checks, which allow you to run the same static checks in CI
+and locally; another part is the CI environment, which is replicated locally with Breeze.
+
+You can read more about Breeze in `BREEZE.rst <BREEZE.rst>`_ but in essence it is a script that allows
+you to re-create the CI environment in your local development instance and interact with it. In its basic
+form, when you do development you can run locally all the same tests that will be run in CI,
+before you submit them as a PR. Another use case where Breeze is useful is when tests fail on CI. You can
+take the ``RUN_ID`` of the failed build, pass it as the ``--github-image-id`` parameter of Breeze, and it will
+download the very same version of the image that was used in CI and run it locally. This way, you can very
+easily reproduce any failed test that happens in CI - even if you do not check out the sources
+connected with the run.
+
+You can read more about it in `BREEZE.rst <BREEZE.rst>`_ and `TESTING.rst <TESTING.rst>`_.
+
+
+Difference between local runs and GitHub Action workflows
+---------------------------------------------------------
+
+Depending on whether the scripts are run locally (most often via `Breeze <BREEZE.rst>`_) or whether they
+are run in the "CI Build" or "Build Image" workflows, the variables described below can take different values.
+
+You can use those variables when you try to reproduce the build locally.
+
++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+
+| Variable | Local | Build Image | Main CI | Comment |
+| | development | CI workflow | Workflow | |
++=========================================+=============+=============+============+=================================================+
+| Basic variables |
++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+
+| ``PYTHON_MAJOR_MINOR_VERSION`` | | | | Major/Minor version of python used.
| ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``DB_RESET`` | false | true | true | Determines whether database should be reset | +| | | | | at the container entry. By default locally | +| | | | | the database is not reset, which allows to | +| | | | | keep the database content between runs in | +| | | | | case of Postgres or MySQL. However, | +| | | | | it requires to perform manual init/reset | +| | | | | if you stop the environment. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| Dockerhub variables | ++-----------------------------------------+----------------------------------------+-------------------------------------------------+ +| ``DOCKERHUB_USER`` | apache | Name of the DockerHub user to use | ++-----------------------------------------+----------------------------------------+-------------------------------------------------+ +| ``DOCKERHUB_REPO`` | airflow | Name of the DockerHub repository to use | ++-----------------------------------------+----------------------------------------+-------------------------------------------------+ +| Mount variables | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``MOUNT_LOCAL_SOURCES`` | true | false | false | Determines whether local sources are | +| | | | | mounted to inside the container. Useful for | +| | | | | local development, as changes you make | +| | | | | locally can be immediately tested in | +| | | | | the container. We mount only selected, | +| | | | | important folders. We do not mount the whole | +| | | | | project folder in order to avoid accidental | +| | | | | use of artifacts (such as ``egg-info`` | +| | | | | directories) generated locally on the | +| | | | | host during development. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| Force variables | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``FORCE_PULL_IMAGES`` | true | true | true | Determines if images are force-pulled, | +| | | | | no matter if they are already present | +| | | | | locally. This includes not only the | +| | | | | CI/PROD images but also the python base | +| | | | | images. Note that if python base images | +| | | | | change, also the CI and PROD images | +| | | | | need to be fully rebuild unless they were | +| | | | | already built with that base python | +| | | | | image. This is false for local development | +| | | | | to avoid often pulling and rebuilding | +| | | | | the image. It is true for CI workflow in | +| | | | | case waiting from images is enabled | +| | | | | as the images needs to be force-pulled from | +| | | | | GitHub Registry, but it is set to | +| | | | | false when waiting for images is disabled. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``FORCE_BUILD_IMAGES`` | false | false | false | Forces building images. This is generally not | +| | | | | very useful in CI as in CI environment image | +| | | | | is built or pulled only once, so there is no | +| | | | | need to set the variable to true. 
For local | +| | | | | builds it forces rebuild, regardless if it | +| | | | | is determined to be needed. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``FORCE_ANSWER_TO_QUESTIONS`` | | yes | yes | This variable determines if answer to questions | +| | | | | during the build process should be | +| | | | | automatically given. For local development, | +| | | | | the user is occasionally asked to provide | +| | | | | answers to questions such as - whether | +| | | | | the image should be rebuilt. By default | +| | | | | the user has to answer but in the CI | +| | | | | environment, we force "yes" answer. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``SKIP_CHECK_REMOTE_IMAGE`` | false | true | true | Determines whether we check if remote image | +| | | | | is "fresher" than the current image. | +| | | | | When doing local breeze runs we try to | +| | | | | determine if it will be faster to rebuild | +| | | | | the image or whether the image should be | +| | | | | pulled first from the cache because it has | +| | | | | been rebuilt. This is slightly experimental | +| | | | | feature and will be improved in the future | +| | | | | as the current mechanism does not always | +| | | | | work properly. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| Host variables | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``HOST_USER_ID`` | | | | User id of the host user. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``HOST_GROUP_ID`` | | | | Group id of the host user. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``HOST_OS`` | | Linux | Linux | OS of the Host (Darwin/Linux). | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``HOST_HOME`` | | | | Home directory on the host. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``HOST_AIRFLOW_SOURCES`` | | | | Directory where airflow sources are located | +| | | | | on the host. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| Image variables | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``INSTALL_AIRFLOW_VERSION`` | | | | Installs Airflow version from PyPI when | +| | | | | building image. Can be "none" to skip airflow | +| | | | | installation so that it can be installed from | +| | | | | locally prepared packages. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``INSTALL_AIRFLOW_REFERENCE`` | | | | Installs Airflow version from GitHub | +| | | | | branch or tag. 
| ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| Version suffix variables | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``VERSION_SUFFIX_FOR_PYPI`` | | | | Version suffix used during backport | +| | | | | package preparation for PyPI builds. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``VERSION_SUFFIX_FOR_SVN`` | | | | Version suffix used during backport | +| | | | | package preparation for SVN builds. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| Git variables | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| COMMIT_SHA | | GITHUB_SHA | GITHUB_SHA | SHA of the commit of the build is run | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| Verbosity variables | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``PRINT_INFO_FROM_SCRIPTS`` | true | true | true | Allows to print output to terminal from running | +| | (x) | (x) | (x) | scripts. It prints some extra outputs if true | +| | | | | including what the commands do, results of some | +| | | | | operations, summary of variable values, exit | +| | | | | status from the scripts, outputs of failing | +| | | | | commands. If verbose is on it also prints the | +| | | | | commands executed by docker, kind, helm, | +| | | | | kubectl. Disabled in pre-commit checks. | +| | | | | | +| | | | | (x) set to false in pre-commits | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``VERBOSE`` | false | true | true | Determines whether docker, helm, kind, | +| | | | | kubectl commands should be printed before | +| | | | | execution. This is useful to determine | +| | | | | what exact commands were executed for | +| | | | | debugging purpose as well as allows | +| | | | | to replicate those commands easily by | +| | | | | copy&pasting them from the output. | +| | | | | requires ``PRINT_INFO_FROM_SCRIPTS`` set to | +| | | | | true. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``VERBOSE_COMMANDS`` | false | false | false | Determines whether every command | +| | | | | executed in bash should also be printed | +| | | | | before execution. This is a low-level | +| | | | | debugging feature of bash (set -x) and | +| | | | | it should only be used if you are lost | +| | | | | at where the script failed. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| Image build variables | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``UPGRADE_TO_LATEST_CONSTRAINTS`` | false | false | false | Determines whether the build should | +| | | | (x) | attempt to upgrade all | +| | | | | PIP dependencies to latest ones matching | +| | | | | ``setup.py`` limits. 
This tries to replicate | +| | | | | the situation of "fresh" user who just installs | +| | | | | airflow and uses latest version of matching | +| | | | | dependencies. By default we are using a | +| | | | | tested set of dependency constraints | +| | | | | stored in separated "orphan" branches | +| | | | | of the airflow repository | +| | | | | ("constraints-master, "constraints-1-10") | +| | | | | but when this flag is set to anything but false | +| | | | | (for example commit SHA), they are not used | +| | | | | used and "eager" upgrade strategy is used | +| | | | | when installing dependencies. We set it | +| | | | | to true in case of direct pushes (merges) | +| | | | | to master and scheduled builds so that | +| | | | | the constraints are tested. In those builds, | +| | | | | in case we determine that the tests pass | +| | | | | we automatically push latest set of | +| | | | | "tested" constraints to the repository. | +| | | | | | +| | | | | Setting the value to commit SHA is best way | +| | | | | to assure that constraints are upgraded even if | +| | | | | there is no change to setup.py | +| | | | | | +| | | | | This way our constraints are automatically | +| | | | | tested and updated whenever new versions | +| | | | | of libraries are released. | +| | | | | | +| | | | | (x) true in case of direct pushes and | +| | | | | scheduled builds | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``CHECK_IMAGE_FOR_REBUILD`` | true | true | true | Determines whether attempt should be | +| | | | (x) | made to rebuild the CI image with latest | +| | | | | sources. It is true by default for | +| | | | | local builds, however it is set to | +| | | | | true in case we know that the image | +| | | | | we pulled or built already contains | +| | | | | the right sources. In such case we | +| | | | | should set it to false, especially | +| | | | | in case our local sources are not the | +| | | | | ones we intend to use (for example | +| | | | | when ``--github-image-id`` is used | +| | | | | in Breeze. | +| | | | | | +| | | | | In CI builds it is set to true | +| | | | | in case of the "Build Image" | +| | | | | workflow or when | +| | | | | waiting for images is disabled | +| | | | | in the CI workflow. | +| | | | | | +| | | | | (x) if waiting for images the variable is set | +| | | | | to false automatically. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ +| ``SKIP_BUILDING_PROD_IMAGE`` | false | false | false | Determines whether we should skip building | +| | | | (x) | the PROD image with latest sources. | +| | | | | It is set to false, but in deploy app for | +| | | | | kubernetes step it is set to "true", because at | +| | | | | this stage we know we have good image build or | +| | | | | pulled. | +| | | | | | +| | | | | (x) set to true in "Deploy App to Kubernetes" | +| | | | | to false automatically. | ++-----------------------------------------+-------------+-------------+------------+-------------------------------------------------+ + +Running CI Builds locally +========================= + +The following variables are automatically determined based on CI environment variables. 
+You can run them locally by setting ``CI="true"`` and running the CI scripts from the ``scripts/ci`` folder: + +* ``provider_packages`` - scripts to build and test provider packages +* ``constraints`` - scripts to build and publish the latest set of valid constraints +* ``docs`` - scripts to build documentation +* ``images`` - scripts to build and push CI and PROD images +* ``kubernetes`` - scripts to set up the kubernetes cluster, deploy airflow and run kubernetes tests with it +* ``testing`` - scripts that run unit and integration tests +* ``tools`` - scripts that perform various clean-up and preparation tasks + +Common libraries of functions for all the scripts can be found in the ``libraries`` folder. + +For detailed use of those scripts you can refer to ``.github/workflows/`` - those scripts are used +by our CI workflows. + +The default values are "sane" and you can change them to interact with your own repositories or registries. +Note that you need to set the "CI" variable to "true" in order to get the same results as in CI. + ++------------------------------+----------------------+-----------------------------------------------------+ +| Variable | Default | Comment | ++==============================+======================+=====================================================+ +| CI | ``false`` | If set to "true", we simulate behaviour of | +| | | all scripts as if they are in CI environment | ++------------------------------+----------------------+-----------------------------------------------------+ +| CI_TARGET_REPO | ``apache/airflow`` | Target repository for the CI build. Used to | +| | | compare incoming changes from PR with the target. | ++------------------------------+----------------------+-----------------------------------------------------+ +| CI_TARGET_BRANCH | ``master`` | Target branch where the PR should land. Used to | +| | | compare incoming changes from PR with the target. | ++------------------------------+----------------------+-----------------------------------------------------+ +| CI_BUILD_ID | ``0`` | Unique id of the build that is kept across re-runs | +| | | (for GitHub Actions it is ``GITHUB_RUN_ID``) | ++------------------------------+----------------------+-----------------------------------------------------+ +| CI_JOB_ID | ``0`` | Unique id of the job - used to produce unique | +| | | artifact names. | ++------------------------------+----------------------+-----------------------------------------------------+ +| CI_EVENT_TYPE | ``pull_request`` | Type of the event. It can be one of | +| | | [``pull_request``, ``pull_request_target``, | +| | | ``schedule``, ``push``] | ++------------------------------+----------------------+-----------------------------------------------------+ +| CI_REF | ``refs/head/master`` | Branch in the source repository that is used to | +| | | make the pull request. | ++------------------------------+----------------------+-----------------------------------------------------+ + + +GitHub Registry Variables +========================= + +Our CI uses GitHub Registry to pull and push images to/from by default. You can however make it interact with +the DockerHub registry, or change the GitHub registry to interact with and use your own repo, by changing +``GITHUB_REPOSITORY`` and providing your own GitHub Username and Token.
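+For example, a minimal sketch of simulating the CI environment against your own fork and registry could look
+like the following. The repository name, username and token are placeholders - substitute your own values -
+and the final line is only an illustration: pick whichever script from the folders listed above you actually
+want to run.
+
+.. code-block:: bash
+
+    export CI="true"
+    export CI_TARGET_REPO="<your-github-user>/airflow"
+    export GITHUB_REPOSITORY="<your-github-user>/airflow"
+    export GITHUB_USERNAME="<your-github-user>"
+    export GITHUB_TOKEN="<your-personal-access-token>"
+
+    # log in to the GitHub registry so that images can be pulled and pushed
+    echo "${GITHUB_TOKEN}" | docker login docker.pkg.github.com \
+        --username "${GITHUB_USERNAME}" --password-stdin
+
+    # placeholder - run the script you are interested in, for example one from scripts/ci/images/
+    ./scripts/ci/<subfolder>/<script>.sh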
+ ++--------------------------------+---------------------------+----------------------------------------------+ +| Variable | Default | Comment | ++================================+===========================+==============================================+ +| USE_GITHUB_REGISTRY | true | If set to "true", we interact with GitHub | +| | | Registry, not the DockerHub one. | ++--------------------------------+---------------------------+----------------------------------------------+ +| GITHUB_REGISTRY | ``docker.pkg.github.com`` | DNS name of the GitHub registry to | +| | | use. | ++--------------------------------+---------------------------+----------------------------------------------+ +| GITHUB_REPOSITORY | ``apache/airflow`` | Prefix of the image. It indicates which | +| | | registry from GitHub to use. | ++--------------------------------+---------------------------+----------------------------------------------+ +| GITHUB_USERNAME | | Username to use to log in to GitHub | +| | | | ++--------------------------------+---------------------------+----------------------------------------------+ +| GITHUB_TOKEN | | Personal token to use to log in to GitHub | +| | | | ++--------------------------------+---------------------------+----------------------------------------------+ +| GITHUB_REGISTRY_WAIT_FOR_IMAGE | ``false`` | Wait for the image to be available. This is | +| | | useful if commit SHA is used as the pull tag | ++--------------------------------+---------------------------+----------------------------------------------+ +| GITHUB_REGISTRY_PULL_IMAGE_TAG | ``latest`` | Pull this image tag. This is "latest" by | +| | | default, can be commit SHA or RUN_ID. | ++--------------------------------+---------------------------+----------------------------------------------+ +| GITHUB_REGISTRY_PUSH_IMAGE_TAG | ``latest`` | Push this image tag. This is "latest" by | +| | | default, can be commit SHA or RUN_ID. | ++--------------------------------+---------------------------+----------------------------------------------+ + +DockerHub Variables +=================== + +If ``USE_GITHUB_REGISTRY`` is set to "false" you can interact directly with DockerHub. By default +you pull from/push to the "apache/airflow" DockerHub repository, but you can change +that to your own repository by setting these environment variables: + ++----------------+-------------+-----------------------------------+ +| Variable | Default | Comment | ++================+=============+===================================+ +| DOCKERHUB_USER | ``apache`` | Name of the DockerHub user to use | ++----------------+-------------+-----------------------------------+ +| DOCKERHUB_REPO | ``airflow`` | Name of the DockerHub repo to use | ++----------------+-------------+-----------------------------------+ + +CI Architecture +=============== + + .. This image is an export from the 'draw.io' graph available in + https://cwiki.apache.org/confluence/display/AIRFLOW/AIP-23+Migrate+out+of+Travis+CI + You can edit it there and re-export. + +..
image:: images/ci/CI.png + :align: center + :alt: CI architecture of Apache Airflow + +The following components are part of the CI infrastructure: + +* **Apache Airflow Code Repository** - our code repository at https://github.com/apache/airflow +* **Apache Airflow Forks** - forks of the Apache Airflow Code Repository from which contributors make + Pull Requests +* **GitHub Actions** - (GA) UI + execution engine for our jobs +* **GA CRON trigger** - GitHub Actions CRON triggering our jobs +* **GA Workers** - virtual machines running our jobs at GitHub Actions (max 20 in parallel) +* **GitHub Private Image Registry** - image registry used as build cache for CI jobs. + It is at https://docker.pkg.github.com/apache/airflow/airflow +* **DockerHub Public Image Registry** - publicly available image registry at DockerHub. + It is at https://hub.docker.com/repository/docker/apache/airflow +* **DockerHub Build Workers** - virtual machines running build jobs at DockerHub +* **Official Images** (future) - these are official images that are prominently visible in DockerHub. + We aim for our images to become official images so that you will be able to pull them + with ``docker pull apache-airflow`` + +CI run types +============ + +The following CI Job run types are currently run for Apache Airflow (run by the ci.yaml workflow) +and each of the run types has a different purpose and context. + +Pull request run +---------------- + +Those runs are results of PRs from the forks made by contributors. Most builds for Apache Airflow fall +into this category. They are executed in the context of the "Fork", not the main +Airflow Code Repository, which means that they have only "read" permission to all the GitHub resources +(container registry, code repository). This is necessary as the code in those PRs (including the CI job +definition) might be modified by people who are not committers for the Apache Airflow Code Repository. + +The main purpose of those jobs is to check if the PR builds cleanly, if the tests run properly and if +the PR is ready to review and merge. The runs use cached images from the Private GitHub registry - +CI, Production Images as well as base Python images that are also cached in the Private GitHub registry. +Also for those builds we only execute Python tests if important files changed (so for example if it is +a "no-code" change, no tests will be executed). + +The workflow involved in Pull Requests review and approval is a bit more complex than simple workflows +in most other projects because we've implemented some optimizations related to efficient use +of queue slots we share with other Apache Software Foundation projects. More details about it +can be found in `PULL_REQUEST_WORKFLOW.rst `_. + + +Direct Push/Merge Run +--------------------- + +Those runs are results of direct pushes done by the committers or a result of merging a Pull Request +by the committers. Those runs execute in the context of the Apache Airflow Code Repository and also have +write permission for GitHub resources (container registry, code repository). +The main purpose of the run is to check if the code after merge still holds all the assertions - like +whether it still builds and all tests are green. + +This is needed because some of the conflicting changes from multiple PRs might cause build and test failures +after merge even if they do not fail in isolation.
Also, those runs are already reviewed and confirmed by the +committers so they can be used to do some housekeeping: +- pushing the most recent image built in the PR to the GitHub Private Registry (for caching) +- upgrading to latest constraints and pushing those constraints if all tests succeed +- refreshing the latest Python base images in case a new patch-level is released + +The housekeeping is important - Python base images are refreshed with varying frequency (once every few months +usually, but sometimes several times per week) with the latest security and bug fixes. +Those patch-level image releases can occasionally break Airflow builds (specifically Docker image builds +based on those images) therefore in PRs we only use the latest "good" Python image that we store in the +private GitHub cache. The direct push/master builds do not use the registry cache to pull the Python images +- they pull the images directly from DockerHub, therefore they will try the latest images +after they are released. In case they are fine - the CI Docker image builds and the tests pass - +those jobs will push the base images to the private GitHub Registry so that they can be used by subsequent +PR runs. + +Scheduled runs +-------------- + +Those runs are results of the (nightly) triggered job - only for the ``master`` branch. The +main purpose of the job is to check if there was no impact of external dependency changes on the Apache +Airflow code (for example transitive dependencies released that fail the build). It also checks if the +Docker images can be built from scratch (again - to see if some dependencies have not changed - for +example downloaded package releases etc.). Another reason for the nightly build is that it tags the most +recent master with the ``nightly-master`` tag so that the DockerHub build can pick up the moved tag and prepare a +nightly public master build in the DockerHub registry. The ``v1-10-test`` branch images are built in +DockerHub when pushing ``v1-10-stable`` manually. + +All runs consist of the same jobs, but the jobs behave slightly differently or they are skipped in different +run categories. Here is a summary of the run categories with regard to the jobs they are running. +Those jobs often have a matrix run strategy which runs several different variations of the jobs +(with different backend type / Python version, or type of the tests to run, for example). The following chapter +describes the workflows that execute for each run. + +Those runs and their corresponding ``Build Images`` runs are only executed in the main ``apache/airflow`` +repository, they are not executed in forks - we want to be nice to the contributors and not use their +free build minutes on GitHub Actions. + +Workflows +========= + +Build Images Workflow +--------------------- + +This workflow has two purposes - it builds images for the CI Workflow but it also cancels duplicate or +failed builds in order to save job time in GitHub Actions and allow for faster feedback for developers. + +It's a special type of workflow: ``workflow_run``, which means that it is triggered by other workflows (in our +case it is triggered by the ``CI Build`` workflow). This also means that the workflow has Write permission to +the Airflow repository and it can - for example - push to the GitHub registry the images used by CI Builds, +which means that the images can be built only once and reused by all the CI jobs (including the matrix jobs).
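+To illustrate the sharing described above: once the "Build Images" workflow has pushed an image tagged with
+the workflow's RUN_ID, any CI job (or you, locally) can simply pull that image instead of rebuilding it.
+The tag below is a placeholder - see the "Reproducing CI Runs locally" chapter below for a concrete run id:
+
+.. code-block:: bash
+
+    # pull the CI image that the "Build Images" workflow pushed for a given RUN_ID
+    docker pull docker.pkg.github.com/apache/airflow/master-python3.6-ci:<RUN_ID>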
+We've implemented it in such a way that the running CI Build will wait until the images are built by the +"Build Images" workflow. + +It's possible to disable this feature and go back to the previous behaviour via the +``GITHUB_REGISTRY_WAIT_FOR_IMAGE`` flag in the workflow definition. Setting it to "false" switches back to +the behaviour where each job builds its own image. + +You can also switch back to jobs building the images on their own at the fork level by setting the +``AIRFLOW_GITHUB_REGISTRY_WAIT_FOR_IMAGE`` secret to ``false``. This will disable pushing the "RUN_ID" +images to GitHub Registry and all the images will be built locally by each job. It is about 20% +slower for the whole build on average, but it does not require access to push images to +GitHub, which sometimes might not be available (depending on the account status). + +The write permission also allows us to cancel duplicate workflows. This is not possible for the Pull Request +CI Builds run from the forks as they have no Write permission allowing them to cancel running workflows. +In our case we perform several different cancellations: + +* we cancel duplicate "CI Build" workflow runs (i.e. workflows from the same repository and branch that + were started in quick succession) - this allows us to save workers that would have been busy running an older + version of the same Pull Request (usually with fix-ups) and free them for other runs. + +* we cancel duplicate "Build Images" workflow runs for the same reasons. The "Build Images" workflow runs image + builds which take quite some time, so pushing a fixup quickly on the same branch will also cancel the + past "Build Images" workflows. + +* last, but not least - we cancel any of the "CI Build" workflow runs that failed in some important jobs. + This is another optimisation - GitHub does not have "fail-fast" on the whole run and this cancelling + effectively implements "fail-fast" of runs for some important jobs. Note that it only works when you + submit new PRs or push new changes. In case the jobs failed and no new changes are pushed after that, the whole + run will run to completion. + +The workflow has the following jobs: + ++---------------------------+---------------------------------------------+ +| Job | Description | +| | | ++===========================+=============================================+ +| Cancel workflow runs | Cancels duplicated and failed workflows | ++---------------------------+---------------------------------------------+ +| Build Info | Prints detailed information about the build | ++---------------------------+---------------------------------------------+ +| Build CI/PROD images | Builds all configured CI and PROD images | ++---------------------------+---------------------------------------------+ + +The images are stored in the `GitHub Registry `_ and the +names of those images follow the patterns described in +`Naming conventions for stored images <#naming-conventions-for-stored-images>`_. + +Image building is configured in "fail-fast" mode. When any of the images +fails to build, it cancels the other builds and the source "CI Build" workflow run +that triggered it. + + +CI Build Workflow +----------------- + +This workflow is a regular workflow that performs all checks of the Airflow code.
+ ++---------------------------+----------------------------------------------+-------+-------+------+ +| Job | Description | PR | Push | CRON | +| | | | Merge | (1) | ++===========================+==============================================+=======+=======+======+ +| Build info | Prints detailed information about the build | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Helm tests | Runs tests for the Helm chart | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Test OpenAPI client gen | Tests if OpenAPIClient continues to generate | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| CI Images | Waits for CI Images (3) | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Static checks | Performs static checks without pylint | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Static checks: pylint | Performs pylint static checks | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Build docs | Builds documentation | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Spell check docs | Spell check for documentation | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Backport packages | Prepares Backport Packages for 1.10 Airflow | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Trigger tests | Checks if tests should be triggered | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Tests [Pg/Msql/Sqlite] | Run all the Pytest tests for Python code | Yes(2)| Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Quarantined tests | Flaky tests that we need to fix (5) | Yes(2)| Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Upload coverage | Uploads test coverage from all the tests | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| PROD Images | Waits for CI Images (3) | Yes | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Tests Kubernetes | Run Kubernetes test | Yes(2)| Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Push PROD images | Pushes PROD images to GitHub Registry (4) | - | Yes | - | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Push CI images | Pushes CI images to GitHub Registry (4) | - | Yes | - | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Constraints | Upgrade constraints to latest ones (4) | - | Yes | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ +| Constraints push | Pushes all upgraded constraints (4) | - | Yes | Yes | 
++---------------------------+----------------------------------------------+-------+-------+------+ +| Tag Repo nightly | Tags the repository with nightly tag (6) | - | - | Yes | ++---------------------------+----------------------------------------------+-------+-------+------+ + + +Comments: + + (1) CRON jobs build images from scratch - to test if everything works properly for clean builds + (2) The tests are run when the Trigger Tests job determines that important files changed (this allows + for example "no-code" changes to build much faster) + (3) The jobs wait for CI images if the ``GITHUB_REGISTRY_WAIT_FOR_IMAGE`` variable is set to "true". + You can set it to "false" to disable using shared images - this is slower though as the images + are rebuilt in every job that needs them. You can also set your own fork's secret + ``AIRFLOW_GITHUB_REGISTRY_WAIT_FOR_IMAGE`` to ``false`` to trigger the same behaviour. + (4) PROD and CI images are pushed as "latest" to the DockerHub registry and constraints are upgraded only if all + tests are successful. Note that images are not pushed in CRON jobs because they are rebuilt from + scratch and we want to push incremental changes to the DockerHub registry. + (5) Flaky tests never fail in regular builds. See the next chapter where our approach to flaky tests + is explained. + (6) The nightly tag is pushed to the repository only in the CRON job and only if all tests pass. This + causes the DockerHub images to be built automatically and made available to developers. + +Scheduled quarantined builds +---------------------------- + +This workflow runs only quarantined tests. Those tests do not fail the build even if some tests fail (only if +the whole pytest execution fails). Instead, this workflow updates one of the issues where we keep the status +of quarantined tests. Once a test succeeds in NUM_RUNS subsequent runs, it is marked as stable and +can be removed from quarantine. You can read more about quarantine in ``_ + +The issues are only updated if the tests are run as a direct push or scheduled run and only in the +``apache/airflow`` repository - so that the issues are not updated in forks. + +The issues that get updated are different for different branches: + +* master: `Quarantine tests master `_ +* v1-10-stable: `Quarantine tests v1-10-stable `_ +* v1-10-test: `Quarantine tests v1-10-test `_ + +Those runs and their corresponding ``Build Images`` runs are only executed in the main ``apache/airflow`` +repository, they are not executed in forks - we want to be nice to the contributors and not use their +free build minutes on GitHub Actions. + +Force sync master from apache/airflow +------------------------------------- + +This is a manually triggered workflow (via GitHub UI manual run) that should only be run in GitHub forks. +When triggered, it will force-push the "apache/airflow" master to the fork's master. It's the easiest +way to sync your fork's master to Apache Airflow's master. + +Delete old artifacts +-------------------- + +This workflow was introduced to delete old artifacts from the GitHub Actions builds. We set it to +delete artifacts that are > 7 days old. It only runs for the 'apache/airflow' repository. + +We also have a script that can help to clean up the old artifacts: +`remove_artifacts.sh `_ + +CodeQL scan +----------- + +The `CodeQL `_ security scan uses the GitHub security scanning framework to scan our code for security violations. +It is run for JavaScript and Python code.
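+As a side note to the "Force sync master from apache/airflow" workflow described above: if you prefer to
+sync your fork manually, a rough command-line equivalent is sketched below. It assumes your fork is the
+``origin`` remote and that the Apache repository is added as the ``apache`` remote (both names are
+conventions, not requirements):
+
+.. code-block:: bash
+
+    git remote add apache https://github.com/apache/airflow.git   # only needed once
+    git fetch apache
+    git checkout master
+    git reset --hard apache/master
+    git push --force origin master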
+ +Naming conventions for stored images +==================================== + +The images produced during the CI builds are stored in the +`GitHub Registry `_ + +The images are stored with both "latest" tag (for last master push image that passes all the tests as well +with the tags indicating the origin of the image. + +The image names follow the patterns: + ++--------------+----------------------------+--------------------------------+--------------------------------------------------------------------------------------------+ +| Image | Name pattern | Tag for format | Comment | ++==============+============================+================================+============================================================================================+ +| Python image | python | -slim-buster- | Base python image used by both production and CI image. | +| | | -slim-buster- | Python maintainer release new versions of those image with security fixes every few weeks. | ++--------------+----------------------------+--------------------------------+--------------------------------------------------------------------------------------------+ +| CI image | -python-ci | | CI image - this is the image used for most of the tests. | +| | | | | ++--------------+----------------------------+--------------------------------+--------------------------------------------------------------------------------------------+ +| PROD Build | -python-build | | Production Build image - this is the "build" segment of production image. | +| image | | | It contains build-essentials and all necessary packages to install PIP packages. | ++--------------+----------------------------+--------------------------------+--------------------------------------------------------------------------------------------+ +| PROD image | -python | | Production image. This is the actual production image - optimized for size. | +| | | | It contains only compiled libraries and minimal set of dependencies to run Airflow. | ++--------------+----------------------------+--------------------------------+--------------------------------------------------------------------------------------------+ + +* might be either "master" or "v1-10-test" or "v2-0-test" +* - Python version (Major + Minor). For "master" and "v2-0-test" should be in ["3.6", "3.7", "3.8"]. For + v1-10-test it should be in ["2.7", "3.5", "3.6". "3.7", "3.8"]. +* - GitHub Actions RUN_ID. You can get it from CI action job outputs (run id is printed in + logs and displayed as part of the step name. All PRs belong to some RUN_ID and this way you can + pull the very exact version of image used in that RUN_ID +* - for images that get merged to "master", "v2-0-test" of "v1-10-test" the images are also tagged + with the commit SHA of that particular commit. This way you can easily find the image that was used + for testing for that "master", "v2-0-test" or "v1-10-test" test run. + +Reproducing CI Runs locally +=========================== + +Since we store images from every CI run, you should be able easily reproduce any of the CI build problems +locally. You can do it by pulling and using the right image and running it with the right docker command, +For example knowing that the CI build had 210056909 RUN_ID (you can find it from GitHub CI logs): + +.. 
code-block:: bash + + docker pull docker.pkg.github.com/apache/airflow/master-python3.6-ci:210056909 + + docker run -it docker.pkg.github.com/apache/airflow/master-python3.6-ci:210056909 + + +But you usually need to pass more variables amd complex setup if you want to connect to a database or +enable some integrations. Therefore it is easiest to use `Breeze `_ for that. For example if +you need to reproduce a MySQL environment with kerberos integration enabled for run 210056909, in python +3.8 environment you can run: + +.. code-block:: bash + + ./breeze --github-image-id 210056909 --python 3.8 --integration kerberos + +You will be dropped into a shell with the exact version that was used during the CI run and you will +be able to run pytest tests manually, easily reproducing the environment that was used in CI. Note that in +this case, you do not need to checkout the sources that were used for that run - they are already part of +the image - but remember that any changes you make in those sources are lost when you leave the image as +the sources are not mapped from your host machine. + +CI Sequence diagrams +==================== + +Sequence diagrams are shown of the flow happening during the CI builds. + +Pull request flow from fork +--------------------------- + +.. image:: images/ci/pull_request_ci_flow.png + :align: center + :alt: Pull request flow from fork + + +Direct Push/Merge flow +---------------------- + +.. image:: images/ci/push_ci_flow.png + :align: center + :alt: Direct Push/Merge flow + +Scheduled build flow +--------------------- + +.. image:: images/ci/scheduled_ci_flow.png + :align: center + :alt: Scheduled build flow diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 1c5c4829ca9ad..bc641178a7d07 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -42,8 +42,7 @@ to follow it and apply to the programme and follow up with the community. Report Bugs ----------- -Report bugs through `Apache -JIRA `__. +Report bugs through `GitHub `__. Please report relevant information and preferably code that exhibits the problem. @@ -51,16 +50,16 @@ problem. Fix Bugs -------- -Look through the JIRA issues for bugs. Anything is open to whoever wants to +Look through the GitHub issues for bugs. Anything is open to whoever wants to implement it. Implement Features ------------------ -Look through the `Apache -JIRA `__ for features. +Look through the `GitHub issues labeled "kind:feature" +`__ for features. -Any unassigned "Improvement" issue is open to whoever wants to implement it. +Any unassigned feature request issue is open to whoever wants to implement it. We've created the operators, hooks, macros and executors we needed, but we've made sure that this part of Airflow is extensible. New operators, hooks, macros @@ -76,8 +75,7 @@ articles. Submit Feedback --------------- -The best way to send feedback is to open an issue on `Apache -JIRA `__. +The best way to send feedback is to `open an issue on GitHub `__. If you are proposing a new feature: @@ -86,37 +84,265 @@ If you are proposing a new feature: - Remember that this is a volunteer-driven project, and that contributions are welcome :) -Documentation + +Roles ============= -The latest API documentation is usually available -`here `__. +There are several roles within the Airflow Open-Source community. -To generate a local version: -1. Set up an Airflow development environment. 
+PMC Member +----------- +The PMC (Project Management Committee) is a group of maintainers that drives changes in the way that +Airflow is managed as a project. -2. Install the ``doc`` extra. +Considering Apache, the role of the PMC is primarily to ensure that Airflow conforms to Apache's processes +and guidelines. -.. code-block:: bash +Committers/Maintainers +---------------------- + +Committers are community members that have write access to the project’s repositories, i.e., they can modify the code, +documentation, and website by themselves and also accept other contributions. + +The official list of committers can be found `here `__. + +Additionally, committers are listed in a few other places (some of these may only be visible to existing committers): + +* https://whimsy.apache.org/roster/ppmc/airflow +* https://github.com/orgs/apache/teams/airflow-committers/members + +Committers are responsible for: + +* Championing one or more items on the `Roadmap `__ +* Reviewing & Merging Pull-Requests +* Scanning and responding to GitHub issues +* Responding to questions on the dev mailing list (dev@airflow.apache.org) + +Becoming a Committer +-------------------- + +There is no strict protocol for becoming a committer. +Candidates for new committers are typically people that are active contributors and community members. - pip install -e '.[doc]' +The key aspects of a committer are: +* Consistent contributions over the past 6 months +* Understanding of Airflow Core or has displayed a holistic understanding of a particular part and made + contributions towards a more strategic goal +* Understanding of contributor/committer guidelines: `Contributors' Guide `__ +* Quality of the commits +* Visibility in community discussions (dev mailing list, Slack and GitHub) +* Testing Release Candidates -3. Generate and serve the documentation as follows: + +Contributors +------------ + +A contributor is anyone who wants to contribute code, documentation, tests, ideas, or anything to the +Apache Airflow project. + +Contributors are responsible for: + +* Fixing bugs +* Adding features +* Championing one or more items on the `Roadmap `__. + +Contribution Workflow +===================== + +Typically, you start your first contribution by reviewing open tickets +at `GitHub issues `__. + +If you create pull-request, you don't have to create an issue first, but if you want, you can do it. +Creating an issue will allow you to collect feedback or share plans with other people. + +For example, you want to have the following sample ticket assigned to you: +`#7782: Add extra CC: to the emails sent by Airflow `_. + +In general, your contribution includes the following stages: + +.. image:: images/workflow.png + :align: center + :alt: Contribution Workflow + +1. Make your own `fork `__ of + the Apache Airflow `main repository `__. + +2. Create a `local virtualenv `_, + initialize the `Breeze environment `__, and + install `pre-commit framework `__. + If you want to add more changes in the future, set up your fork and enable GitHub Actions. + +3. Join `devlist `__ + and set up a `Slack account `__. + +4. Make the change and create a `Pull Request from your fork `__. + +5. Ping @ #development slack, comment @people. Be annoying. Be considerate. + +Step 1: Fork the Apache Airflow Repo +------------------------------------ +From the `apache/airflow `_ repo, +`create a fork `_: + +.. 
image:: images/fork.png + :align: center + :alt: Creating a fork + + +Step 2: Configure Your Environment +---------------------------------- +Configure the Docker-based Breeze development environment and run tests. + +You can use the default Breeze configuration as follows: + +1. Install the latest versions of the Docker Community Edition + and Docker Compose and add them to the PATH. + +2. Enter Breeze: ``./breeze`` + + Breeze starts with downloading the Airflow CI image from + the Docker Hub and installing all required dependencies. + +3. Enter the Docker environment and mount your local sources + to make them immediately visible in the environment. + +4. Create a local virtualenv, for example: .. code-block:: bash - cd docs - ./build.sh - ./start_doc_server.sh + mkvirtualenv myenv --python=python3.6 -.. note:: - The docs build script ``build.sh`` requires bash 4.0 or greater. - If you are building on mac, you can install latest version of bash with homebrew. +5. Initialize the created environment: + +.. code-block:: bash + + ./breeze initialize-local-virtualenv --python 3.6 + +6. Open your IDE (for example, PyCharm) and select the virtualenv you created + as the project's default virtualenv in your IDE. + +Step 3: Connect with People +--------------------------- + +For effective collaboration, make sure to join the following Airflow groups: + +- Mailing lists: + + - Developer’s mailing list ``_ + (quite substantial traffic on this list) + + - All commits mailing list: ``_ + (very high traffic on this list) + + - Airflow users mailing list: ``_ + (reasonably small traffic on this list) + +- `Issues on GitHub `__ + +- `Slack (chat) `__ + +Step 4: Prepare PR +------------------ + +1. Update the local sources to address the issue. + + For example, to address this example issue, do the following: + + * Read about `email configuration in Airflow `__. + + * Find the class you should modify. For the example GitHub issue, + this is `email.py `__. + + * Find the test class where you should add tests. For the example ticket, + this is `test_email.py `__. + * Make sure your fork's master is synced with Apache Airflow's master before you create a branch. See + `How to sync your fork <#how-to-sync-your-fork>`_ for details. + + * Create a local branch for your development. Make sure to use latest + ``apache/master`` as base for the branch. See `How to Rebase PR <#how-to-rebase-pr>`_ for some details + on setting up the ``apache`` remote. Note, some people develop their changes directly in their own + ``master`` branches - this is OK and you can make PR from your master to ``apache/master`` but we + recommend to always create a local branch for your development. This allows you to easily compare + changes, have several changes that you work on at the same time and many more. + If you have ``apache`` set as remote then you can make sure that you have latest changes in your master + by ``git pull apache master`` when you are in the local ``master`` branch. If you have conflicts and + want to override your locally changed master you can override your local changes with + ``git fetch apache; git reset --hard apache/master``. + + * Modify the class and add necessary code and unit tests. + + * Run the unit tests from the `IDE `__ + or `local virtualenv `__ as you see fit. + + * Run the tests in `Breeze `__. + + * Run and fix all the `static checks `__. If you have + `pre-commits installed `__, + this step is automatically run while you are committing your code. 
If not, you can do it manually + via ``git add`` and then ``pre-commit run``. + +2. Rebase your fork, squash commits, and resolve all conflicts. See `How to rebase PR <#how-to-rebase-pr>`_ + if you need help with rebasing your change. Remember to rebase often if your PR takes a lot of time to + review/fix. This will make rebase process much easier and less painful and the more often you do it, + the more comfortable you will feel doing it. + +3. Re-run static code checks again. + +4. Make sure your commit has a good title and description of the context of your change, enough + for the committer reviewing it to understand why you are proposing a change. Make sure to follow other + PR guidelines described in `pull request guidelines <#pull-request-guidelines>`_. + Create Pull Request! Make yourself ready for the discussion! + +5. Depending on "scope" of your changes, your Pull Request might go through one of few paths after approval. + We run some non-standard workflow with high degree of automation that allows us to optimize the usage + of queue slots in Github Actions. Our automated workflows determine the "scope" of changes in your PR + and send it through the right path: + + * In case of a "no-code" change, approval will generate a comment that the PR can be merged and no + tests are needed. This is usually when the change modifies some non-documentation related rst + files (such as this file). No python tests are run and no CI images are built for such PR. Usually + it can be approved and merged few minutes after it is submitted (unless there is a big queue of jobs). + + * In case of change involving python code changes or documentation changes, a subset of full test matrix + will be executed. This subset of tests perform relevant tests for single combination of python, backend + version and only builds one CI image and one PROD image. Here the scope of tests depends on the + scope of your changes: + + * when your change does not change "core" of Airflow (Providers, CLI, WWW, Helm Chart) you will get the + comment that PR is likely ok to be merged without running "full matrix" of tests. However decision + for that is left to committer who approves your change. The committer might set a "full tests needed" + label for your PR and ask you to rebase your request or re-run all jobs. PRs with "full tests needed" + run full matrix of tests. + + * when your change changes the "core" of Airflow you will get the comment that PR needs full tests and + the "full tests needed" label is set for your PR. Additional check is set that prevents from + accidental merging of the request until full matrix of tests succeeds for the PR. + + * when your change has "upgrade to newer dependencies" label set, constraints will be automatically + upgraded to latest constraints matching your setup.py. This is useful in case you want to force + upgrade to a latest version of dependencies. You can ask committers to set the label for you + when you need it in your PR. + + More details about the PR workflow be found in `PULL_REQUEST_WORKFLOW.rst `_. + + +Step 5: Pass PR Review +---------------------- + +.. image:: images/review.png + :align: center + :alt: PR Review + +Note that committers will use **Squash and Merge** instead of **Rebase and Merge** +when merging PRs and your commit will be squashed to single commit. + +You need to have review of at least one committer (if you are committer yourself, it has to be +another committer). 
Ideally you should have 2 or more committers reviewing the code that touches +the core of Airflow. -If you are creating ``example_dags`` directory, you need to create ``example_dags/__init__.py`` with Apache license or copy another ``__init__.py`` file that contains the necessary license. Pull Request Guidelines ======================= @@ -127,30 +353,21 @@ these guidelines: - Include tests, either as doctests, unit tests, or both, to your pull request. - The airflow repo uses `Travis CI `__ to + The airflow repo uses `GitHub Actions `__ to run the tests and `codecov `__ to track - coverage. You can set up both for free on your fork (see - `Travis CI Testing Framework `__ usage guidelines). - It will help you make sure you do not break the build with your PR and - that you help increase coverage. + coverage. You can set up both for free on your fork. It will help you make sure you do not + break the build with your PR and that you help increase coverage. + +- Follow our project's `Coding style and best practices`_. + + These are things that aren't currently enforced programmatically (either because they are too hard or just + not yet done.) - `Rebase your fork `__, squash commits, and resolve all conflicts. - When merging PRs, wherever possible try to use **Squash and Merge** instead of **Rebase and Merge**. -- Make sure every pull request introducing code changes has an associated - `JIRA `__ - ticket. The JIRA link should also be added to the PR description. In case of documentation only changes - the JIRA ticket is not necessary. - -- Preface your commit's subject & PR title with **[AIRFLOW-NNNN] COMMIT_MSG** where *NNNN* - is the JIRA number. For example: [AIRFLOW-5574] Fix Google Analytics script loading. In case of - documentation only changes you should put "[AIRFLOW-XXXX]" instead. - We compose Airflow release notes from all commit titles in a release. By placing the JIRA number in the - commit title and hence in the release notes, we let Airflow users look into - JIRA and GitHub PRs for more details about a particular change. - - Add an `Apache License `__ header to all new files. @@ -167,7 +384,7 @@ these guidelines: - Run tests locally before opening PR. -- Make sure the pull request works for Python 2.7, 3.5 and 3.6. +- Make sure the pull request works for Python 2.7, 3.5, 3.6, 3.7 and 3.8. - Adhere to guidelines for commit messages described in this `article `__. This makes the lives of those who come after you a lot easier. @@ -187,6 +404,13 @@ usually these are developers with the release manager permissions. Once the branch is stable, the ``v1-10-stable`` branch is synchronized with ``v1-10-test``. The ``v1-10-stable`` branch is used to release ``1.10.x`` releases. +The general approach is that cherry-picking a commit that has already had a PR and unit tests run +against main is done to ``v1-10-test`` branch, but PRs from contributors towards 1.10 should target +``v1-10-stable`` branch. + +The ``v1-10-test`` branch and ``v1-10-stable`` ones are merged just before the release and that's the +time when they converge. + Development Environments ======================== @@ -281,8 +505,9 @@ Benefits: where all these services are available and can be used by tests automatically. -- Breeze environment is almost the same as used in `Travis CI `__ automated builds. - So, if the tests run in your Breeze environment, they will work in Travis CI as well. +- Breeze environment is almost the same as used in the CI automated builds. 
+ So, if the tests run in your Breeze environment, they will work in the CI as well. + See ``_ for details about Airflow CI. Limitations: @@ -303,37 +528,144 @@ Limitations: They are optimized for repeatability of tests, maintainability and speed of building rather than production performance. The production images are not yet officially published. + +Airflow dependencies +==================== + Extras ------ There are a number of extras that can be specified when installing Airflow. Those extras can be specified after the usual pip install - for example -``pip install -e .[gcp]``. For development purpose there is a ``devel`` extra that +``pip install -e .[ssh]``. For development purpose there is a ``devel`` extra that installs all development dependencies. There is also ``devel_ci`` that installs -all dependencies needed in CI envioronment. +all dependencies needed in the CI environment. + +.. note:: + On 30th of November 2020, new version of PIP (20.3) has been released with a new, 2020 resolver. + This resolver does not yet work with Apache Airflow and might leads to errors in installation - + depends on your choice of extras. In order to install Airflow you need to either downgrade + pip to version 20.2.4 ``pip upgrade --pip==20.2.4`` or, in case you use Pip 20.3, you need to add option + ``--use-deprecated legacy-resolver`` to your pip install command. + This is the full list of those extras: .. START EXTRAS HERE -all, all_dbs, async, atlas, aws, azure, azure_blob_storage, azure_container_instances, azure_cosmos, -azure_data_lake, cassandra, celery, cgroups, cloudant, crypto, dask, databricks, datadog, devel, -devel_azure, devel_ci, devel_hadoop, doc, docker, druid, elasticsearch, emr, gcp, gcp_api, -github_enterprise, google_auth, grpc, hashicorp, hdfs, hive, jdbc, jira, kerberos, kubernetes, ldap, -mongo, mssql, mysql, oracle, papermill, password, pinot, postgres, presto, qds, rabbitmq, redis, s3, -salesforce, samba, segment, sendgrid, sentry, slack, snowflake, ssh, statsd, vertica, virtualenv, -webhdfs, winrm +all, all_dbs, amazon, apache.atlas, apache.cassandra, apache.druid, apache.hdfs, apache.hive, +apache.pinot, apache.presto, apache.webhdfs, async, atlas, aws, azure, azure_blob_storage, +azure_container_instances, azure_cosmos, azure_data_lake, azure_secrets, cassandra, celery, cgroups, +cloudant, cncf.kubernetes, crypto, dask, databricks, datadog, devel, devel_all, devel_azure, +devel_ci, devel_hadoop, doc, docker, druid, elasticsearch, emr, gcp, gcp_api, github_enterprise, +google, google_auth, grpc, hashicorp, hdfs, hive, jdbc, jira, kerberos, kubernetes, ldap, +microsoft.azure, microsoft.mssql, microsoft.winrm, mongo, mssql, mysql, oracle, papermill, password, +pinot, postgres, presto, qds, rabbitmq, redis, s3, salesforce, samba, segment, sendgrid, sentry, +slack, snowflake, ssh, statsd, vertica, virtualenv, webhdfs, winrm .. END EXTRAS HERE +Provider packages +----------------- -Airflow dependencies --------------------- +Airflow 2.0 is split into core and providers. They are delivered as separate packages: + +* ``apache-airflow`` - core of Apache Airflow +* ``apache-airflow-providers-*`` - More than 50 provider packages to communicate with external services + +In Airflow 1.10 all those providers were installed together within one single package and when you installed +airflow locally, from sources, they were also installed. 
In Airflow 2.0, providers are separated out, +and not installed together with the core, unless you set ``INSTALL_PROVIDERS_FROM_SOURCES`` environment +variable to ``true``. + +In Breeze - which is a development environment, ``INSTALL_PROVIDERS_FROM_SOURCES`` variable is set to true, +but you can add ``--skip-installing-airflow-providers-from-sources`` flag to Breeze to skip installing providers when +building the images. + +One watch-out - providers are still always installed (or rather available) if you install airflow from +sources using ``-e`` (or ``--editable``) flag. In such case airflow is read directly from the sources +without copying airflow packages to the usual installation location, and since 'providers' folder is +in this airflow folder - the providers package is importable. + +Some of the packages have cross-dependencies with other providers packages. This typically happens for +transfer operators where operators use hooks from the other providers in case they are transferring +data between the providers. The list of dependencies is maintained (automatically with pre-commits) +in the ``airflow/providers/dependencies.json``. Pre-commits are also used to generate dependencies. +The dependency list is automatically used during pypi packages generation. + +Cross-dependencies between provider packages are converted into extras - if you need functionality from +the other provider package you can install it adding [extra] after the +``apache-airflow-backport-providers-PROVIDER`` for example: +``pip install apache-airflow-backport-providers-google[amazon]`` in case you want to use GCP +transfer operators from Amazon ECS. + +.. note:: + On 30th of November 2020, new version of PIP (20.3) has been released with a new, 2020 resolver. + This resolver does not yet work with Apache Airflow and might leads to errors in installation - + depends on your choice of extras. In order to install Airflow you need to either downgrade + pip to version 20.2.4 ``pip upgrade --pip==20.2.4`` or, in case you use Pip 20.3, you need to add option + ``--use-deprecated legacy-resolver`` to your pip install command. + + +If you add a new dependency between different providers packages, it will be detected automatically during +pre-commit phase and pre-commit will fail - and add entry in dependencies.json so that the package extra +dependencies are properly added when package is installed. + +You can regenerate the whole list of provider dependencies by running this command (you need to have +``pre-commits`` installed). + +.. code-block:: bash + + pre-commit run build-providers-dependencies + + +Here is the list of packages and their extras: + + + .. START PACKAGE DEPENDENCIES HERE + +========================== =========================== +Package Extras +========================== =========================== +amazon apache.hive,google,imap,mongo,mysql,postgres,ssh +apache.druid apache.hive +apache.hive amazon,microsoft.mssql,mysql,presto,samba,vertica +apache.livy http +dingding http +discord http +google amazon,apache.cassandra,cncf.kubernetes,facebook,microsoft.azure,microsoft.mssql,mysql,postgres,presto,salesforce,sftp +hashicorp google +microsoft.azure google,oracle +microsoft.mssql odbc +mysql amazon,presto,vertica +opsgenie http +postgres amazon +sftp ssh +slack http +snowflake slack +========================== =========================== + + .. END PACKAGE DEPENDENCIES HERE + +Backport providers +------------------ + +You can also build backport provider packages for Airflow 1.10. 
They aim to provide a bridge when users +of Airflow 1.10 want to migrate to Airflow 2.0. The backport packages are named similarly to the +provider packages, but with "backport" added: + +* ``apache-airflow-backport-provider-*`` + +Those backport providers are automatically refactored to work with Airflow 1.10.* and have a few +limitations described in those packages. + +Dependency management +===================== Airflow is not a standard python project. Most of the python projects fall into one of two types - application or library. As described in [StackOverflow Question](https://stackoverflow.com/questions/28509481/should-i-pin-my-python-dependencies-versions) -decision whether to pin (freeze) requirements for a python project depdends on the type. For +decision whether to pin (freeze) dependency versions for a python project depends on the type. For applications, dependencies should be pinned, but for libraries, they should be open. For application, pinning the dependencies makes it more stable to install in the future - because new @@ -343,80 +675,75 @@ be open to allow several different libraries with the same requirements to be in The problem is that Apache Airflow is a bit of both - application to install and library to be used when you are developing your own operators and DAGs. -This - seemingly unsolvable - puzzle is solved by having pinned requirement files. Those are available -as of airflow 1.10.10. +This - seemingly unsolvable - puzzle is solved by having pinned constraints files. Those are available +as of airflow 1.10.10 and further improved with 1.10.12 (moved to separate orphan branches) -Pinned requirement files ------------------------- +Pinned constraint files +======================= By default when you install ``apache-airflow`` package - the dependencies are as open as possible while -still allowing the apache-airflow package to install. This means that 'apache-airflow' package might fail to +still allowing the apache-airflow package to install. This means that ``apache-airflow`` package might fail to install in case a direct or transitive dependency is released that breaks the installation. In such case when installing ``apache-airflow``, you might need to provide additional constraints (for example ``pip install apache-airflow==1.10.2 Werkzeug<1.0.0``) -However we now have ``requirements-python.txt`` file generated -automatically and committed in the requirements folder based on the set of all latest working and tested -requirement versions. Those ``requirement-python.txt`` files can be used as -constraints file when installing Apache Airflow - either from the sources - -.. code-block:: bash - - pip install -e . --constraint requirements/requirements-python3.6.txt +.. note:: + On November 2020, new version of PIP (20.3) has been released with a new, 2020 resolver. This resolver + does not yet work with Apache Airflow and might leads to errors in installation - depends on your choice + of extras. In order to install Airflow you need to either downgrade pip to version 20.2.4 + ``pip upgrade --pip==20.2.4`` or, in case you use Pip 20.3, you need to add option + ``--use-deprecated legacy-resolver`` to your pip install command. -or from the pypi package +However we now have ``constraints-.txt`` files generated +automatically and committed to orphan ``constraints-master`` and ``constraint-1-10`` branches based on +the set of all latest working and tested dependency versions. 
Those +``constraints-.txt`` files can be used as +constraints file when installing Apache Airflow - either from the sources: .. code-block:: bash - pip install apache-airflow --constraint requirements/requirements-python3.6.txt + pip install -e . \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1-10/constraints-3.6.txt" - -This works also with extras - for example: +or from the pypi package: .. code-block:: bash - pip install .[gcp] --constraint requirements/requirements-python3.6.txt - + pip install apache-airflow \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1-10/constraints-3.6.txt" -It is also possible to use constraints directly from github using tag/version name: +This works also with extras - for example: .. code-block:: bash - pip install apache-airflow[gcp]==1.10.10 \ - --constraint https://raw.githubusercontent.com/apache/airflow/1.10.10/requirements/requirements-python3.6.txt - -There are different set of fixed requirements for different python major/minor versions and you should -use the right requirements file for the right python version. - -The ``requirements-python.txt`` file MUST be regenerated every time after -the ``setup.py`` is updated. This is checked automatically in Travis CI build. There are separate -jobs for each python version that checks if the requirements should be updated. - -If they are not updated, you should regenerate the requirements locally using Breeze as described below. + pip install .[ssh] \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-master/constraints-3.6.txt" -Generating requirement files ----------------------------- -This should be done every time after you modify setup.py file. You can generate requirement files -using `Breeze `_ . Simply use those commands: +As of apache-airflow 1.10.12 it is also possible to use constraints directly from GitHub using specific +tag/hash name. We tag commits working for particular release with constraints- tag. So for example +fixed valid constraints 1.10.12 can be used by using ``constraints-1.10.12`` tag: .. code-block:: bash - breeze generate-requirements --python 3.7 + pip install apache-airflow[ssh]==1.10.12 \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.12/constraints-3.6.txt" -.. code-block:: bash - - breeze generate-requirements --python 3.6 +There are different set of fixed constraint files for different python major/minor versions and you should +use the right file for the right python version. -Note that when you generate requirements this way, you might update to latest version of requirements -that were released since the last time so during tests you might get errors unrelated to your change. -In this case the easiest way to fix it is to limit the culprit dependency to the previous version -with ```` constraint added in setup.py. +The ``constraints-.txt`` will be automatically regenerated by CI cron job +every time after the ``setup.py`` is updated and pushed if the tests are successful. There are separate +jobs for each python version. Backport providers packages --------------------------- +**NOTE:** In case of problems with installation / development of backport packages +check `troubleshooting installing backport packages `_. + Since we are developing new operators in the master branch, we prepared backport packages ready to be installed for Airflow 1.10.* series. 
Those backport operators (the tested ones) are going to be released in PyPi and we are going to maintain the list at @@ -472,13 +799,46 @@ slack http .. END PACKAGE DEPENDENCIES HERE +Documentation +============= + +The latest API documentation (for the master branch) is usually available +`here `__. + +To generate a local version you can use ``_. + +The documentation build consists of verifying consistency of documentation and two steps: + +* spell checking +* building documentation + +You can only run one of the steps via ``--spellcheck-only`` or ``--docs-only``. + +.. code-block:: bash + + ./breeze build-docs + +Also documentation is available as downloadable artifact in GitHub Actions after the CI builds your PR. + +**Known issues:** + +If you are creating a new directory for new integration in the ``airflow.providers`` package, +you should also update the ``docs/autoapi_templates/index.rst`` file. + +If you are creating new ``hooks``, ``sensors``, ``operators`` directory in +the ``airflow.providers`` package, you should also update +the ``docs/operators-and-hooks-ref.rst`` file. + +If you are creating ``example_dags`` directory, you need to create ``example_dags/__init__.py`` with Apache +license or copy another ``__init__.py`` file that contains the necessary license. + Static code checks ================== We check our code quality via static code checks. See `STATIC_CODE_CHECKS.rst `_ for details. -Your code must pass all the static code checks in Travis CI in order to be eligible for Code Review. +Your code must pass all the static code checks in the CI in order to be eligible for Code Review. The easiest way to make sure your code is good before pushing is to use pre-commit checks locally as described in the static code checks documentation. @@ -523,6 +883,67 @@ If this function is designed to be called by "end-users" (i.e. DAG authors) then ... # You SHOULD not commit the session here. The wrapper will take care of commit()/rollback() if exception +Naming Conventions for provider packages +---------------------------------------- + +In Airflow 2.0 we standardized and enforced naming for provider packages, modules and classes. +those rules (introduced as AIP-21) were not only introduced but enforced using automated checks +that verify if the naming conventions are followed. Here is a brief summary of the rules, for +detailed discussion you can go to [AIP-21 Changes in import paths](https://cwiki.apache.org/confluence/display/AIRFLOW/AIP-21%3A+Changes+in+import+paths) + +The rules are as follows: + +* Provider packages are all placed in 'airflow.providers' + +* Providers are usually direct sub-packages of the 'airflow.providers' package but in some cases they can be + further split into sub-packages (for example 'apache' package has 'cassandra', 'druid' ... providers ) out + of which several different provider packages are produced (apache.cassandra, apache.druid). This is + case when the providers are connected under common umbrella but very loosely coupled on the code level. + +* In some cases the package can have sub-packages but they are all delivered as single provider + package (for example 'google' package contains 'ads', 'cloud' etc. sub-packages). This is in case + the providers are connected under common umbrella and they are also tightly coupled on the code level. 
+ +* Typical structure of provider package: + * example_dags -> example DAGs are stored here (used for documentation and System Tests) + * hooks -> hooks are stored here + * operators -> operators are stored here + * sensors -> sensors are stored here + * secrets -> secret backends are stored here + * transfers -> transfer operators are stored here + +* Module names do not contain word "hooks", "operators" etc. The right type comes from + the package. For example 'hooks.datastore' module contains DataStore hook and 'operators.datastore' + contains DataStore operators. + +* Class names contain 'Operator', 'Hook', 'Sensor' - for example DataStoreHook, DataStoreExportOperator + +* Operator name usually follows the convention: ``Operator`` + (BigQueryExecuteQueryOperator) is a good example + +* Transfer Operators are those that actively push data from one service/provider and send it to another + service (might be for the same or another provider). This usually involves two hooks. The convention + for those ``ToOperator``. They are not named *TransferOperator nor *Transfer. + +* Operators that use external service to perform transfer (for example CloudDataTransferService operators + are not placed in "transfers" package and do not have to follow the naming convention for + transfer operators. + +* It is often debatable where to put transfer operators but we agreed to the following criteria: + + * We use "maintainability" of the operators as the main criteria - so the transfer operator + should be kept at the provider which has highest "interest" in the transfer operator + + * For Cloud Providers or Service providers that usually means that the transfer operators + should land at the "target" side of the transfer + +* Secret Backend name follows the convention: ``Backend``. + +* Tests are grouped in parallel packages under "tests.providers" top level package. Module name is usually + ``test_.py``, + +* System tests (not yet fully automated but allowing to run e2e testing of particular provider) are + named with _system.py suffix. Test Infrastructure =================== @@ -534,16 +955,16 @@ We support the following types of tests: and `local virtualenv `_. * **Integration tests** are available in the Breeze development environment - that is also used for Airflow Travis CI tests. Integration test are special tests that require + that is also used for Airflow's CI tests. Integration test are special tests that require additional services running, such as Postgres, Mysql, Kerberos, etc. * **System tests** are automatic tests that use external systems like - Google Cloud Platform. These tests are intended for an end-to-end DAG execution. + Google Cloud. These tests are intended for an end-to-end DAG execution. For details on running different types of Airflow tests, see `TESTING.rst `_. Metadata Database Updates -============================== +========================= When developing features, you may need to persist information to the metadata database. Airflow has `Alembic `__ built-in @@ -584,7 +1005,7 @@ To install yarn on macOS: .. code-block:: bash - brew install node --without-npm + brew install node brew install yarn yarn config set prefix ~/.yarn @@ -623,7 +1044,7 @@ could get a reproducible build. 
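+For reference, a minimal sketch of the initial front-end setup on a fresh checkout (assuming the
+toolchain is installed as above; the ``airflow/www`` path applies to the master branch, and the
+actual bundling commands are shown in the next section):
+
+.. code-block:: bash
+
+    # confirm the toolchain is available
+    node --version
+    yarn --version
+
+    # fetch the JS dependencies pinned in yarn.lock for the webserver UI
+    cd airflow/www
+    yarn install
+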
See the `Yarn docs Generate Bundled Files with yarn ----------------------------------- +-------------------------------- To parse and generate bundled files for Airflow, run either of the following commands: @@ -637,13 +1058,13 @@ commands: yarn run dev -Follow Javascript Style Guide +Follow JavaScript Style Guide ----------------------------- We try to enforce a more consistent style and follow the JS community guidelines. -Once you add or modify any javascript code in the project, please make sure it +Once you add or modify any JavaScript code in the project, please make sure it follows the guidelines defined in `Airbnb JavaScript Style Guide `__. @@ -659,223 +1080,110 @@ commands: # Check JS code in .js and .html files, report any errors/warnings and fix them if possible yarn run lint:fix -Contribution Workflow Example -============================== - -Typically, you start your first contribution by reviewing open tickets -at `Apache JIRA `__. - -For example, you want to have the following sample ticket assigned to you: -`AIRFLOW-5934: Add extra CC: to the emails sent by Aiflow `_. - -In general, your contribution includes the following stages: - -.. image:: images/workflow.png - :align: center - :alt: Contribution Workflow - -1. Make your own `fork `__ of - the Apache Airflow `main repository `__. - -2. Create a `local virtualenv `_, - initialize the `Breeze environment `__, and - install `pre-commit framework `__. - If you want to add more changes in the future, set up your own `Travis CI - fork `__. - -3. Join `devlist `__ - and set up a `Slack account `__. - -4. Make the change and create a `Pull Request from your fork `__. - -5. Ping @ #development slack, comment @people. Be annoying. Be considerate. - -Step 1: Fork the Apache Repo ----------------------------- -From the `apache/airflow `_ repo, -`create a fork `_: - -.. image:: images/fork.png - :align: center - :alt: Creating a fork - - -Step 2: Configure Your Environment ----------------------------------- -Configure the Docker-based Breeze development environment and run tests. - -You can use the default Breeze configuration as follows: - -1. Install the latest versions of the Docker Community Edition - and Docker Compose and add them to the PATH. - -2. Enter Breeze: ``./breeze`` +How to sync your fork +===================== - Breeze starts with downloading the Airflow CI image from - the Docker Hub and installing all required dependencies. +When you have your fork, you should periodically synchronize the master of your fork with the +Apache Airflow master. In order to do that you can ``git pull --rebase`` to your local git repository from +apache remote and push the master (often with ``--force`` to your fork). There is also an easy +way using ``Force sync master from apache/airflow`` workflow. You can go to "Actions" in your repository and +choose the workflow and manually trigger the workflow using "Run workflow" command. -3. Enter the Docker environment and mount your local sources - to make them immediately visible in the environment. +This will force-push the master from apache/airflow to the master in your fork. Note that in case you +modified the master in your fork, you might loose those changes. -4. Create a local virtualenv, for example: -.. code-block:: bash - - mkvirtualenv myenv --python=python3.6 - -5. Initialize the created environment: - -.. code-block:: bash - - ./breeze --initialize-local-virtualenv - -6. 
Open your IDE (for example, PyCharm) and select the virtualenv you created - as the project's default virtualenv in your IDE. - -Step 3: Connect with People ---------------------------- - -For effective collaboration, make sure to join the following Airflow groups: - -- Mailing lists: - - - Developer’s mailing list ``_ - (quite substantial traffic on this list) - - - All commits mailing list: ``_ - (very high traffic on this list) - - - Airflow users mailing list: ``_ - (reasonably small traffic on this list) - -- `Issues on Apache’s JIRA `__ - -- `Slack (chat) `__ - -Step 4: Prepare PR ------------------- - -1. Update the local sources to address the JIRA ticket. +How to rebase PR +================ - For example, to address this example JIRA ticket, do the following: +A lot of people are unfamiliar with the rebase workflow in Git, but we think it is an excellent workflow, +providing a better alternative to the merge workflow. We've therefore written a short guide for those who would like to learn it. - * Read about `email configuration in Airflow `__. +As opposed to the merge workflow, the rebase workflow allows us to +clearly separate your changes from the changes of others. It puts the responsibility of rebasing on the +author of the change. It also produces a "single-line" series of commits on the master branch. This +makes it easier to understand what was going on and to find reasons for problems (it is especially +useful for "bisecting" when looking for a commit that introduced some bugs). - * Find the class you should modify. For the example ticket, - this is `email.py `__. +First of all, we suggest you read about the rebase workflow here: +`Merging vs. rebasing `_. This is an +excellent article that describes all the ins/outs of the rebase workflow. I recommend keeping it for future reference. - * Find the test class where you should add tests. For the example ticket, - this is `test_email.py `__. +The goal of rebasing your PR on top of ``apache/master`` is to "transplant" your change on top of +the latest changes that are merged by others. It also allows you to fix all the conflicts +that arise as a result of other people changing the same files as you and merging the changes to ``apache/master``. - * Create a local branch for your development. Make sure to use latest - ``apache/master`` as base for the branch. See `How to Rebase PR <#how-to-rebase-pr>`_ for some details - on setting up the ``apache`` remote. Note - some people develop their changes directy in their own - ``master`` branches - this is OK and you can make PR from your master to ``apache/master`` but we - recommend to always create a local branch for your development. This allows you to easily compare - changes, have several changes that you work on at the same time and many more. - If you have ``apache`` set as remote then you can make sure that you have latest changes in your master - by ``git pull apache master`` when you are in the local ``master`` branch. If you have conflicts and - want to override your locally changed master you can override your local changes with - ``git fetch apache; git reset --hard apache/master``. +Here is how rebase looks in practice: - * Modify the class and add necessary code and unit tests. +1. You first need to add the Apache project remote to your git repository. In this example, we will be adding the remote +as "apache" so you can refer to it easily: - * Run the unit tests from the `IDE `__ - or `local virtualenv `__ as you see fit. 
+* If you use ssh: ``git remote add apache git@github.com:apache/airflow.git`` +* If you use https: ``git remote add apache https://github.com/apache/airflow.git`` - * Run the tests in `Breeze `__. +2. You then need to make sure that you have the latest master fetched from the ``apache`` repository. You can do this + via: - * Run and fix all the `static checks `__. If you have - `pre-commits installed `__, - this step is automatically run while you are committing your code. If not, you can do it manually - via ``git add`` and then ``pre-commit run``. + ``git fetch apache`` (to fetch apache remote) -2. Rebase your fork, squash commits, and resolve all conflicts. See `How to rebase PR <#how-to-rebase-pr>`_ - if you need help with rebasing your change. Remember to rebase often if your PR takes a lot of time to - review/fix. This will make rebase process much easier and less painful - and the more often you do it, - the more comfortable you will feel doing it. + ``git fetch --all`` (to fetch all remotes) -3. Re-run static code checks again. +3. Assuming that your feature is in a branch in your repository called ``my-branch`` you can easily check + what is the base commit you should rebase from by: -4. Create a pull request with the following title for the sample ticket: - ``[AIRFLOW-5934] Added extra CC: field to the Airflow emails.`` + ``git merge-base my-branch apache/master`` -Make sure to follow other PR guidelines described in `this document <#pull-request-guidelines>`_. + This will print the HASH of the base commit which you should use to rebase your feature from. + For example: ``5abce471e0690c6b8d06ca25685b0845c5fd270f``. You can also find this commit hash manually if you want + better control. + Run: -Step 5: Pass PR Review ----------------------- + ``git log`` -.. image:: images/review.png - :align: center - :alt: PR Review + And find the first commit that you DO NOT want to "transplant". -Note that committers will use **Squash and Merge** instead of **Rebase and Merge** -when merging PRs and your commit will be squashed to single commit. + Performing: -How to rebase PR -================ + ``git rebase HASH`` -A lot of people are unfamiliar with rebase workflow in Git, but we think it is an excellent workflow, -much better than merge workflow, so here is a short guide for those who would like to learn it. It's really -worth to spend a few minutes learning it. As opposed to merge workflow, the rebase workflow allows to -clearly separate your changes from changes of others, puts responsibility of proper rebase on the -author of the change. It also produces a "single-line" series of commits in master branch which -makes it much easier to understand what was going on and to find reasons for problems (it is especially -useful for "bisecting" when looking for a commit that introduced some bugs. + Will "transplant" all commits after the commit with the HASH. +4. Check out your feature branch locally via: -First of all - you can read about rebase workflow here: -`Merging vs. rebasing `_ - this is an -excellent article that describes all ins/outs of rebase. I recommend reading it and keeping it as reference. + ``git checkout my-branch`` -The goal of rebasing your PR on top of ``apache/master`` is to "transplant" your change on top of -the latest changes that are merged by others. It also allows you to fix all the conflicts -that are result of other people changing the same files as you and merging the changes to ``apache/master``. +5. 
Rebase: -Here is how rebase looks in practice: + ``git rebase HASH --onto apache/master`` -1. You need to add Apache remote to your git repository. You can add it as "apache" remote so that - you can refer to it easily: + For example: -``git remote add apache git@github.com:apache/airflow.git`` if you use ssh or -``git remote add apache https://github.com/apache/airflow.git`` if you use https. + ``git rebase 5abce471e0690c6b8d06ca25685b0845c5fd270f --onto apache/master`` -Later on +6. If you have no conflicts - that's cool. You rebased. You can now run ``git push --force-with-lease`` to + push your changes to your repository. That should trigger the build in our CI if you have a + Pull Request (PR) opened already. -2. You need to make sure that you have the latest master fetched from ``apache`` repository. You can do it - by ``git fetch apache`` for apache remote or ``git fetch --all`` to fetch all remotes. +7. While rebasing you might have conflicts. Read carefully what git tells you when it prints information + about the conflicts. You need to solve the conflicts manually. This is sometimes the most difficult + part and requires deliberately correcting your code and looking at what has changed since you developed your + changes. -3. Assuming that your feature is in a branch in your repository called ``my-branch`` you can check easily - what is the base commit you should rebase from by: ``git merge-base my-branch apache/master``. - This will print the HASH of the base commit which you should use to rebase your feature from - - for example: ``5abce471e0690c6b8d06ca25685b0845c5fd270f``. You can also find this commit hash manually - - if you want better control. Run ``git log`` and find the first commit that you DO NOT want to "transplant". - ``git rebase HASH`` will "trasplant" all commits after the commit with the HASH. + There are various tools that can help you with this. You can use: -4. Make sure you checked out your branch locally: + ``git mergetool`` -``git checkout my-branch`` + You can configure different merge tools with it. You can also use IntelliJ/PyCharm's excellent merge tool. + When you open a project in PyCharm which has conflicts, you can go to VCS > Git > Resolve Conflicts and there + you have a very intuitive and helpful merge tool. For more information, see + `Resolve conflicts `_. -5. Rebase: - Run: ``git rebase HASH --onto apache/master`` - for example: ``git rebase 5abce471e0690c6b8d06ca25685b0845c5fd270f --onto apache/master`` +8. After you've solved your conflict run: -6. If you have no conflicts - that's cool. You rebased. You can now run ``git push --force-with-lease`` to - push your changes to your repository. That should trigger the build in CI if you have a - Pull Request opened already. + ``git rebase --continue`` -7. While rebasing you might have conflicts. Read carefully what git tells you when it prints information - about the conflicts. You need to solve the conflicts manually. This is sometimes the most difficult - part and requires deliberate correcting your code looking what has changed since you developed your - changes. There are various tools that can help you with that. You can use ``git mergetool`` (and you can - configure different merge tools with it). Also you can use IntelliJ/PyCharm excellent merge tool. - When you open project in PyCharm which has conflict you can go to VCS->Git->Resolve Conflicts and there - you have a very intuitive and helpful merge tool. You can see more information - about it in `Resolve conflicts `_ - -8. 
After you solved conflicts simply run ``git rebase --continue`` and go either to point 6. or 7. - above depending if you have more commits that cause conflicts in your PR (rebasing applies each + And go either to point 6. or 7, depending on whether you have more commits that cause conflicts in your PR (rebasing applies each commit from your PR one-by-one). How to communicate @@ -887,6 +1195,8 @@ community are far more important than their contribution. This means that communication plays a big role in it, and this chapter is all about it. +In our communication, everyone is expected to follow the `ASF Code of Conduct `_. + We have various channels of communication - starting from the official devlist, comments in the Pull Requests, Slack, wiki. @@ -902,10 +1212,17 @@ You can join the channels via links at the `Airflow Community page `_ for: +* GitHub `Pull Requests (PRs) `_ for: * discussing implementation details of PRs * not for architectural discussions (use the devlist for that) -* The `Apache Airflow Slack `_ for: +* The deprecated `JIRA issues `_ for: + * checking out old but still valuable issues that are not on GitHub yet + * mentioning the JIRA issue number in the title of the related PR you would like to open on GitHub + +**IMPORTANT** +We don't create new issues on JIRA anymore. The reason we still look at JIRA issues is that there are valuable tickets inside of it. However, each new PR should be created on `GitHub issues `_ as stated in `Contribution Workflow Example `_ + +* The `Apache Airflow Slack `_ for: * ad-hoc questions related to development (#development channel) * asking for review (#development channel) * asking for help with PRs (#how-to-pr channel) @@ -978,6 +1295,33 @@ Here are a few rules that are important to keep in mind when you enter our commu * It’s OK to express your own emotions while communicating - it helps other people to understand you * Be considerate for feelings of others. Tell about how you feel not what you think of others +Committer Responsibilities +========================== + +Committers are more than contributors. While it's important for committers to maintain standing by +committing code, their key role is to build and foster a healthy and active community. +This means that committers should: + +* Review PRs in a timely and reliable fashion +* They should also help to actively whittle down the PR backlog +* Answer questions (i.e. on the dev list, in PRs, in GitHub Issues, slack, etc...) +* Take on core changes/bugs/feature requests +* Some changes are important enough that a committer needs to ensure it gets done. This is especially + the case if no one from the community is taking it on. +* Improve processes and tooling +* Refactoring code + +Commit Policy +============= + +The following commit policy passed by a vote 8(binding FOR) to 0 against on May 27, 2016 on the dev list +and slightly modified and consensus reached in October 2020: + +* Commits need a +1 vote from a committer who is not the author +* Do not merge a PR that regresses linting or does not pass CI tests (unless we have + justification such as clearly transient error). +* When we do AIP voting, both PMC and committer +1s are considered as binding vote. + Resources & Links ================= - `Airflow’s official documentation `__ diff --git a/Dockerfile b/Dockerfile index dd20b7e5c7e84..a34b63e668600 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,43 +34,41 @@ # much smaller. 
# ARG AIRFLOW_VERSION="2.0.0.dev0" -ARG WWW_FOLDER="www" - ARG AIRFLOW_EXTRAS="async,aws,azure,celery,dask,elasticsearch,gcp,kubernetes,mysql,postgres,redis,slack,ssh,statsd,virtualenv" +ARG ADDITIONAL_AIRFLOW_EXTRAS="" +ARG ADDITIONAL_PYTHON_DEPS="" ARG AIRFLOW_HOME=/opt/airflow ARG AIRFLOW_UID="50000" ARG AIRFLOW_GID="50000" -ARG PIP_VERSION="19.0.2" ARG CASS_DRIVER_BUILD_CONCURRENCY="8" ARG PYTHON_BASE_IMAGE="python:3.6-slim-buster" ARG PYTHON_MAJOR_MINOR_VERSION="3.6" +ARG PIP_VERSION=20.2.4 + ############################################################################################## # This is the build image where we build all dependencies ############################################################################################## FROM ${PYTHON_BASE_IMAGE} as airflow-build-image SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] -LABEL org.apache.airflow.distro="debian" -LABEL org.apache.airflow.distro.version="buster" -LABEL org.apache.airflow.module="airflow" -LABEL org.apache.airflow.component="airflow" -LABEL org.apache.airflow.image="airflow-build-image" - ARG PYTHON_BASE_IMAGE ENV PYTHON_BASE_IMAGE=${PYTHON_BASE_IMAGE} ARG PYTHON_MAJOR_MINOR_VERSION ENV PYTHON_MAJOR_MINOR_VERSION=${PYTHON_MAJOR_MINOR_VERSION} +ARG PIP_VERSION +ENV PIP_VERSION=${PIP_VERSION} + # Make sure noninteractive debian install is used and language variables set ENV DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \ LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 -# Install curl and gnupg2 - needed to download nodejs in the next step +# Install curl and gnupg2 - needed for many other installation steps RUN apt-get update \ && apt-get install -y --no-install-recommends \ curl \ @@ -79,82 +77,126 @@ RUN apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Install basic apt dependencies -RUN curl --fail --location https://deb.nodesource.com/setup_10.x | bash - \ +ARG DEV_APT_DEPS="\ + apt-transport-https \ + apt-utils \ + build-essential \ + ca-certificates \ + gnupg \ + dirmngr \ + freetds-bin \ + freetds-dev \ + gosu \ + krb5-user \ + ldap-utils \ + libffi-dev \ + libkrb5-dev \ + libpq-dev \ + libsasl2-2 \ + libsasl2-dev \ + libsasl2-modules \ + libssl-dev \ + locales \ + lsb-release \ + nodejs \ + openssh-client \ + postgresql-client \ + python-selinux \ + sasl2-bin \ + software-properties-common \ + sqlite3 \ + sudo \ + unixodbc \ + unixodbc-dev \ + yarn" +ENV DEV_APT_DEPS=${DEV_APT_DEPS} + +ARG ADDITIONAL_DEV_APT_DEPS="" +ENV ADDITIONAL_DEV_APT_DEPS=${ADDITIONAL_DEV_APT_DEPS} + +ARG DEV_APT_COMMAND="\ + curl --fail --location https://deb.nodesource.com/setup_10.x | bash - \ && curl https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - > /dev/null \ - && echo "deb https://dl.yarnpkg.com/debian/ stable main" > /etc/apt/sources.list.d/yarn.list \ - # Note missing man directories on debian-buster - # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199 - && mkdir -pv /usr/share/man/man1 \ + && echo 'deb https://dl.yarnpkg.com/debian/ stable main' > /etc/apt/sources.list.d/yarn.list" +ENV DEV_APT_COMMAND=${DEV_APT_COMMAND} + +ARG ADDITIONAL_DEV_APT_COMMAND="echo" +ENV ADDITIONAL_DEV_APT_COMMAND=${ADDITIONAL_DEV_APT_COMMAND} + +ARG ADDITIONAL_DEV_ENV_VARS="" + +# Note missing man directories on debian-buster +# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199 +# Install basic and additional apt dependencies +RUN mkdir -pv /usr/share/man/man1 \ && mkdir -pv /usr/share/man/man7 \ + && export ${ADDITIONAL_DEV_ENV_VARS?} \ + && bash -o pipefail -e -u 
-x -c "${DEV_APT_COMMAND}" \ + && bash -o pipefail -e -u -x -c "${ADDITIONAL_DEV_APT_COMMAND}" \ && apt-get update \ && apt-get install -y --no-install-recommends \ - apt-transport-https \ - apt-utils \ - build-essential \ - ca-certificates \ - curl \ - gnupg \ - dirmngr \ - freetds-bin \ - freetds-dev \ - gosu \ - krb5-user \ - ldap-utils \ - libffi-dev \ - libkrb5-dev \ - libpq-dev \ - libsasl2-2 \ - libsasl2-dev \ - libsasl2-modules \ - libssl-dev \ - locales \ - lsb-release \ - nodejs \ - openssh-client \ - postgresql-client \ - python-selinux \ - sasl2-bin \ - software-properties-common \ - sqlite3 \ - sudo \ - unixodbc \ - unixodbc-dev \ - yarn \ + ${DEV_APT_DEPS} \ + ${ADDITIONAL_DEV_APT_DEPS} \ && apt-get autoremove -yqq --purge \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Install MySQL client from Oracle repositories (Debian installs mariadb) -RUN KEY="A4A9406876FCBD3C456770C88C718D3B5072E1F5" \ - && GNUPGHOME="$(mktemp -d)" \ - && export GNUPGHOME \ - && for KEYSERVER in $(shuf -e \ - ha.pool.sks-keyservers.net \ - hkp://p80.pool.sks-keyservers.net:80 \ - keyserver.ubuntu.com \ - hkp://keyserver.ubuntu.com:80 \ - pgp.mit.edu) ; do \ - gpg --keyserver "${KEYSERVER}" --recv-keys "${KEY}" && break || true ; \ - done \ - && gpg --export "${KEY}" | apt-key add - \ - && gpgconf --kill all \ - rm -rf "${GNUPGHOME}"; \ - apt-key list > /dev/null \ - && echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-5.7" | tee -a /etc/apt/sources.list.d/mysql.list \ - && apt-get update \ - && apt-get install --no-install-recommends -y \ - libmysqlclient-dev \ - mysql-client \ - && apt-get autoremove -yqq --purge \ - && apt-get clean && rm -rf /var/lib/apt/lists/* +ARG INSTALL_MYSQL_CLIENT="true" +ENV INSTALL_MYSQL_CLIENT=${INSTALL_MYSQL_CLIENT} -ARG PIP_VERSION -ENV PIP_VERSION=${PIP_VERSION} +COPY scripts/docker /scripts/docker +COPY docker-context-files /docker-context-files +# fix permission issue in Azure DevOps when running the script +RUN chmod a+x /scripts/docker/install_mysql.sh +RUN ./scripts/docker/install_mysql.sh dev + +ARG AIRFLOW_REPO=apache/airflow +ENV AIRFLOW_REPO=${AIRFLOW_REPO} + +ARG AIRFLOW_BRANCH=master +ENV AIRFLOW_BRANCH=${AIRFLOW_BRANCH} + +ARG AIRFLOW_EXTRAS +ARG ADDITIONAL_AIRFLOW_EXTRAS="" +ENV AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS}${ADDITIONAL_AIRFLOW_EXTRAS:+,}${ADDITIONAL_AIRFLOW_EXTRAS} + +ARG AIRFLOW_CONSTRAINTS_REFERENCE="constraints-master" +ARG AIRFLOW_CONSTRAINTS_LOCATION="https://raw.githubusercontent.com/apache/airflow/${AIRFLOW_CONSTRAINTS_REFERENCE}/constraints-${PYTHON_MAJOR_MINOR_VERSION}.txt" +ENV AIRFLOW_CONSTRAINTS_LOCATION=${AIRFLOW_CONSTRAINTS_LOCATION} + +ENV PATH=${PATH}:/root/.local/bin +RUN mkdir -p /root/.local/bin -RUN pip install --upgrade pip==${PIP_VERSION} +RUN if [[ -f /docker-context-files/.pypirc ]]; then \ + cp /docker-context-files/.pypirc /root/.pypirc; \ + fi + +RUN pip install --upgrade "pip==${PIP_VERSION}" + +# By default we do not use pre-cached packages, but in CI/Breeze environment we override this to speed up +# builds in case setup.py/setup.cfg changed. This is pure optimisation of CI/Breeze builds. +ARG AIRFLOW_PRE_CACHED_PIP_PACKAGES="false" +ENV AIRFLOW_PRE_CACHED_PIP_PACKAGES=${AIRFLOW_PRE_CACHED_PIP_PACKAGES} + +# In case of Production build image segment we want to pre-install master version of airflow +# dependencies from GitHub so that we do not have to always reinstall it from the scratch. 
+RUN if [[ ${AIRFLOW_PRE_CACHED_PIP_PACKAGES} == "true" ]]; then \ + if [[ ${INSTALL_MYSQL_CLIENT} != "true" ]]; then \ + AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS/mysql,}; \ + fi; \ + pip install --user \ + "https://github.com/${AIRFLOW_REPO}/archive/${AIRFLOW_BRANCH}.tar.gz#egg=apache-airflow[${AIRFLOW_EXTRAS}]" \ + --constraint "${AIRFLOW_CONSTRAINTS_LOCATION}" \ + && pip uninstall --yes apache-airflow; \ + fi + +# By default we install latest airflow from PyPI so we do not need to copy sources of Airflow +# but in case of breeze/CI builds we use latest sources and we override those +# those SOURCES_FROM/TO with "." and "/opt/airflow" respectively +ARG AIRFLOW_SOURCES_FROM="empty" +ENV AIRFLOW_SOURCES_FROM=${AIRFLOW_SOURCES_FROM} -ARG AIRFLOW_SOURCES_TO="/opt/airflow" +ARG AIRFLOW_SOURCES_TO="/empty" ENV AIRFLOW_SOURCES_TO=${AIRFLOW_SOURCES_TO} COPY ${AIRFLOW_SOURCES_FROM} ${AIRFLOW_SOURCES_TO} @@ -162,53 +204,96 @@ COPY ${AIRFLOW_SOURCES_FROM} ${AIRFLOW_SOURCES_TO} ARG CASS_DRIVER_BUILD_CONCURRENCY ENV CASS_DRIVER_BUILD_CONCURRENCY=${CASS_DRIVER_BUILD_CONCURRENCY} +# This is airflow version that is put in the label of the image build ARG AIRFLOW_VERSION ENV AIRFLOW_VERSION=${AIRFLOW_VERSION} -ARG AIRFLOW_EXTRAS -ENV AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS} +ARG ADDITIONAL_PYTHON_DEPS="" +ENV ADDITIONAL_PYTHON_DEPS=${ADDITIONAL_PYTHON_DEPS} -ARG AIRFLOW_INSTALL_SOURCES="." -ENV AIRFLOW_INSTALL_SOURCES=${AIRFLOW_INSTALL_SOURCES} +# Determines the way airflow is installed. By default we install airflow from PyPI `apache-airflow` package +# But it also can be `.` from local installation or GitHub URL pointing to specific branch or tag +# Of Airflow. Note That for local source installation you need to have local sources of +# Airflow checked out together with the Dockerfile and AIRFLOW_SOURCES_FROM and AIRFLOW_SOURCES_TO +# set to "." and "/opt/airflow" respectively. +ARG AIRFLOW_INSTALLATION_METHOD="apache-airflow" +ENV AIRFLOW_INSTALLATION_METHOD=${AIRFLOW_INSTALLATION_METHOD} +# By default latest released version of airflow is installed (when empty) but this value can be overriden +# and we can install specific version of airflow this way. ARG AIRFLOW_INSTALL_VERSION="" ENV AIRFLOW_INSTALL_VERSION=${AIRFLOW_INSTALL_VERSION} -ARG CONSTRAINT_REQUIREMENTS="requirements/requirements-python${PYTHON_MAJOR_MINOR_VERSION}.txt" -ENV CONSTRAINT_REQUIREMENTS=${CONSTRAINT_REQUIREMENTS} - -ARG AIRFLOW_SOURCES_FROM="." -ENV AIRFLOW_SOURCES_FROM=${AIRFLOW_SOURCES_FROM} - -WORKDIR /opt/airflow +# We can seet this value to true in case we want to install .whl .tar.gz packages placed in the +# docker-context-files folder. This can be done for both - additional packages you want to install +# and for airflow as well (you have to set INSTALL_FROM_PYPI to false in this case) +ARG INSTALL_FROM_DOCKER_CONTEXT_FILES="" +ENV INSTALL_FROM_DOCKER_CONTEXT_FILES=${INSTALL_FROM_DOCKER_CONTEXT_FILES} -# hadolint ignore=DL3020 -ADD "${CONSTRAINT_REQUIREMENTS}" /requirements.txt +# By default we install latest airflow from PyPI. You can set it to false if you want to install +# Airflow from the .whl or .tar.gz packages placed in `docker-context-files` folder. +ARG INSTALL_FROM_PYPI="true" +ENV INSTALL_FROM_PYPI=${INSTALL_FROM_PYPI} -ENV PATH=${PATH}:/root/.local/bin +# By default we install providers from PyPI but in case of Breze build we want to install providers +# from local sources without the neeed of preparing provider packages upfront. This value is +# automatically overridden by Breeze scripts. 
+ARG INSTALL_PROVIDERS_FROM_SOURCES="false" +ENV INSTALL_PROVIDERS_FROM_SOURCES=${INSTALL_PROVIDERS_FROM_SOURCES} -RUN pip install --user "${AIRFLOW_INSTALL_SOURCES}[${AIRFLOW_EXTRAS}]${AIRFLOW_INSTALL_VERSION}" \ - --constraint /requirements.txt && \ - find /root/.local/ -name '*.pyc' -print0 | xargs -0 rm -r && \ - find /root/.local/ -type d -name '__pycache__' -print0 | xargs -0 rm -r +WORKDIR /opt/airflow +# remove mysql from extras if client is not installed +RUN if [[ ${INSTALL_MYSQL_CLIENT} != "true" ]]; then \ + AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS/mysql,}; \ + fi; \ + if [[ ${INSTALL_FROM_PYPI} == "true" ]]; then \ + pip install --user "${AIRFLOW_INSTALLATION_METHOD}[${AIRFLOW_EXTRAS}]${AIRFLOW_INSTALL_VERSION}" \ + --constraint "${AIRFLOW_CONSTRAINTS_LOCATION}"; \ + fi; \ + if [[ -n "${ADDITIONAL_PYTHON_DEPS}" ]]; then \ + pip install --user ${ADDITIONAL_PYTHON_DEPS} --constraint "${AIRFLOW_CONSTRAINTS_LOCATION}"; \ + fi; \ + if [[ ${INSTALL_FROM_DOCKER_CONTEXT_FILES} == "true" ]]; then \ + if ls /docker-context-files/*.{whl,tar.gz} 1> /dev/null 2>&1; then \ + pip install --user --no-deps /docker-context-files/*.{whl,tar.gz}; \ + fi ; \ + fi; \ + find /root/.local/ -name '*.pyc' -print0 | xargs -0 rm -r || true ; \ + find /root/.local/ -type d -name '__pycache__' -print0 | xargs -0 rm -r || true + +RUN AIRFLOW_SITE_PACKAGE="/root/.local/lib/python${PYTHON_MAJOR_MINOR_VERSION}/site-packages/airflow"; \ + if [[ -f "${AIRFLOW_SITE_PACKAGE}/www_rbac/package.json" ]]; then \ + WWW_DIR="${AIRFLOW_SITE_PACKAGE}/www_rbac"; \ + elif [[ -f "${AIRFLOW_SITE_PACKAGE}/www/package.json" ]]; then \ + WWW_DIR="${AIRFLOW_SITE_PACKAGE}/www"; \ + fi; \ + if [[ ${WWW_DIR:=} != "" ]]; then \ + yarn --cwd "${WWW_DIR}" install --frozen-lockfile --no-cache; \ + yarn --cwd "${WWW_DIR}" run prod; \ + rm -rf "${WWW_DIR}/node_modules"; \ + rm -vf "${WWW_DIR}"/{package.json,yarn.lock,.eslintignore,.eslintrc,.stylelintignore,.stylelintrc,compile_assets.sh,webpack.config.js} ;\ + fi -ARG WWW_FOLDER -ENV WWW_FOLDER=${WWW_FOLDER} +# make sure that all directories and files in .local are also group accessible +RUN find /root/.local -executable -print0 | xargs --null chmod g+x && \ + find /root/.local -print0 | xargs --null chmod g+rw -ENV AIRFLOW_WWW=/root/.local/lib/python${PYTHON_MAJOR_MINOR_VERSION}/site-packages/airflow/${WWW_FOLDER} -RUN if [[ -f "${AIRFLOW_WWW}/package.json" ]]; then \ - yarn --cwd ${AIRFLOW_WWW} install --frozen-lockfile --no-cache; \ - yarn --cwd ${AIRFLOW_WWW} run prod; \ - rm -rf ${AIRFLOW_WWW}/node_modules; \ - fi +ARG BUILD_ID +ENV BUILD_ID=${BUILD_ID} +ARG COMMIT_SHA +ENV COMMIT_SHA=${COMMIT_SHA} -ARG ENTRYPOINT_FILE="entrypoint.sh" -ENV ENTRYPOINT_FILE="${ENTRYPOINT_FILE}" -# hadolint ignore=DL3020 -ADD ${ENTRYPOINT_FILE} /entrypoint +LABEL org.apache.airflow.distro="debian" \ + org.apache.airflow.distro.version="buster" \ + org.apache.airflow.module="airflow" \ + org.apache.airflow.component="airflow" \ + org.apache.airflow.image="airflow-build-image" \ + org.apache.airflow.version="${AIRFLOW_VERSION}" \ + org.apache.airflow.buildImage.buildId=${BUILD_ID} \ + org.apache.airflow.buildImage.commitSha=${COMMIT_SHA} ############################################################################################## # This is the actual Airflow image - much smaller than the build one. 
We copy @@ -220,13 +305,13 @@ SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] ARG AIRFLOW_UID ARG AIRFLOW_GID -LABEL org.apache.airflow.distro="debian" -LABEL org.apache.airflow.distro.version="buster" -LABEL org.apache.airflow.module="airflow" -LABEL org.apache.airflow.component="airflow" -LABEL org.apache.airflow.image="airflow" -LABEL org.apache.airflow.uid="${AIRFLOW_UID}" -LABEL org.apache.airflow.gid="${AIRFLOW_GID}" +LABEL org.apache.airflow.distro="debian" \ + org.apache.airflow.distro.version="buster" \ + org.apache.airflow.module="airflow" \ + org.apache.airflow.component="airflow" \ + org.apache.airflow.image="airflow" \ + org.apache.airflow.uid="${AIRFLOW_UID}" \ + org.apache.airflow.gid="${AIRFLOW_GID}" ARG PYTHON_BASE_IMAGE ENV PYTHON_BASE_IMAGE=${PYTHON_BASE_IMAGE} @@ -238,95 +323,141 @@ ENV AIRFLOW_VERSION=${AIRFLOW_VERSION} ENV DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 \ LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 +ARG PIP_VERSION +ENV PIP_VERSION=${PIP_VERSION} + +# Install curl and gnupg2 - needed for many other installation steps +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl \ + gnupg2 \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ARG RUNTIME_APT_DEPS="\ + apt-transport-https \ + apt-utils \ + ca-certificates \ + curl \ + dumb-init \ + freetds-bin \ + gnupg \ + gosu \ + krb5-user \ + ldap-utils \ + libffi6 \ + libsasl2-2 \ + libsasl2-modules \ + libssl1.1 \ + locales \ + lsb-release \ + netcat \ + openssh-client \ + postgresql-client \ + rsync \ + sasl2-bin \ + sqlite3 \ + sudo \ + unixodbc" +ENV RUNTIME_APT_DEPS=${RUNTIME_APT_DEPS} + +ARG ADDITIONAL_RUNTIME_APT_DEPS="" +ENV ADDITIONAL_RUNTIME_APT_DEPS=${ADDITIONAL_RUNTIME_APT_DEPS} + +ARG RUNTIME_APT_COMMAND="echo" +ENV RUNTIME_APT_COMMAND=${RUNTIME_APT_COMMAND} + +ARG ADDITIONAL_RUNTIME_APT_COMMAND="" +ENV ADDITIONAL_RUNTIME_APT_COMMAND=${ADDITIONAL_RUNTIME_APT_COMMAND} + +ARG ADDITIONAL_RUNTIME_ENV_VARS="" + # Note missing man directories on debian-buster # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199 -# Install basic apt dependencies +# Install basic and additional apt dependencies RUN mkdir -pv /usr/share/man/man1 \ && mkdir -pv /usr/share/man/man7 \ + && export ${ADDITIONAL_RUNTIME_ENV_VARS?} \ + && bash -o pipefail -e -u -x -c "${RUNTIME_APT_COMMAND}" \ + && bash -o pipefail -e -u -x -c "${ADDITIONAL_RUNTIME_APT_COMMAND}" \ && apt-get update \ && apt-get install -y --no-install-recommends \ - apt-transport-https \ - apt-utils \ - ca-certificates \ - curl \ - dumb-init \ - freetds-bin \ - gnupg \ - gosu \ - krb5-user \ - ldap-utils \ - libffi6 \ - libsasl2-2 \ - libsasl2-modules \ - libssl1.1 \ - locales \ - lsb-release \ - netcat \ - openssh-client \ - postgresql-client \ - rsync \ - sasl2-bin \ - sqlite3 \ - sudo \ - unixodbc \ + ${RUNTIME_APT_DEPS} \ + ${ADDITIONAL_RUNTIME_APT_DEPS} \ && apt-get autoremove -yqq --purge \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Install MySQL client from Oracle repositories (Debian installs mariadb) -RUN KEY="A4A9406876FCBD3C456770C88C718D3B5072E1F5" \ - && GNUPGHOME="$(mktemp -d)" \ - && export GNUPGHOME \ - && for KEYSERVER in $(shuf -e \ - ha.pool.sks-keyservers.net \ - hkp://p80.pool.sks-keyservers.net:80 \ - keyserver.ubuntu.com \ - hkp://keyserver.ubuntu.com:80 \ - pgp.mit.edu) ; do \ - gpg --keyserver "${KEYSERVER}" --recv-keys "${KEY}" && break || true ; \ - done \ - && gpg --export "${KEY}" | apt-key add - \ - && gpgconf 
--kill all \ - rm -rf "${GNUPGHOME}"; \ - apt-key list > /dev/null \ - && echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-5.7" | tee -a /etc/apt/sources.list.d/mysql.list \ - && apt-get update \ - && apt-get install --no-install-recommends -y \ - libmysqlclient20 \ - mysql-client \ - && apt-get autoremove -yqq --purge \ - && apt-get clean && rm -rf /var/lib/apt/lists/* +ARG INSTALL_MYSQL_CLIENT="true" +ENV INSTALL_MYSQL_CLIENT=${INSTALL_MYSQL_CLIENT} -ARG PIP_VERSION -ENV PIP_VERSION=${PIP_VERSION} -RUN pip install --upgrade pip==${PIP_VERSION} +COPY scripts/docker /scripts/docker +# fix permission issue in Azure DevOps when running the script +RUN chmod a+x /scripts/docker/install_mysql.sh +RUN ./scripts/docker/install_mysql.sh prod ENV AIRFLOW_UID=${AIRFLOW_UID} ENV AIRFLOW_GID=${AIRFLOW_GID} +ENV AIRFLOW__CORE__LOAD_EXAMPLES="false" + +ARG AIRFLOW_USER_HOME_DIR=/home/airflow +ENV AIRFLOW_USER_HOME_DIR=${AIRFLOW_USER_HOME_DIR} + RUN addgroup --gid "${AIRFLOW_GID}" "airflow" && \ adduser --quiet "airflow" --uid "${AIRFLOW_UID}" \ - --ingroup "airflow" \ - --home /home/airflow + --gid "${AIRFLOW_GID}" \ + --home "${AIRFLOW_USER_HOME_DIR}" ARG AIRFLOW_HOME ENV AIRFLOW_HOME=${AIRFLOW_HOME} +# Make Airflow files belong to the root group and are accessible. This is to accomodate the guidelines from +# OpenShift https://docs.openshift.com/enterprise/3.0/creating_images/guidelines.html RUN mkdir -pv "${AIRFLOW_HOME}"; \ mkdir -pv "${AIRFLOW_HOME}/dags"; \ mkdir -pv "${AIRFLOW_HOME}/logs"; \ - chown -R "airflow" "${AIRFLOW_HOME}" + chown -R "airflow:root" "${AIRFLOW_USER_HOME_DIR}" "${AIRFLOW_HOME}"; \ + find "${AIRFLOW_HOME}" -executable -print0 | xargs --null chmod g+x && \ + find "${AIRFLOW_HOME}" -print0 | xargs --null chmod g+rw + +COPY --chown=airflow:root --from=airflow-build-image /root/.local "${AIRFLOW_USER_HOME_DIR}/.local" + +COPY --chown=airflow:root scripts/in_container/prod/entrypoint_prod.sh /entrypoint +COPY --chown=airflow:root scripts/in_container/prod/clean-logs.sh /clean-logs +RUN chmod a+x /entrypoint /clean-logs -COPY --chown=airflow:airflow --from=airflow-build-image /root/.local "/home/airflow/.local" -COPY --chown=airflow:airflow --from=airflow-build-image /entrypoint /entrypoint +RUN pip install --upgrade "pip==${PIP_VERSION}" -USER airflow +# Make /etc/passwd root-group-writeable so that user can be dynamically added by OpenShift +# See https://github.com/apache/airflow/issues/9248 +RUN chmod g=u /etc/passwd -ENV PATH="/home/airflow/.local/bin:${PATH}" +ENV PATH="${AIRFLOW_USER_HOME_DIR}/.local/bin:${PATH}" +ENV GUNICORN_CMD_ARGS="--worker-tmp-dir /dev/shm" WORKDIR ${AIRFLOW_HOME} -ENV AIRFLOW__CORE__LOAD_EXAMPLES="false" +EXPOSE 8080 + +USER ${AIRFLOW_UID} + +ARG BUILD_ID +ENV BUILD_ID=${BUILD_ID} +ARG COMMIT_SHA +ENV COMMIT_SHA=${COMMIT_SHA} + +LABEL org.apache.airflow.distro="debian" \ + org.apache.airflow.distro.version="buster" \ + org.apache.airflow.module="airflow" \ + org.apache.airflow.component="airflow" \ + org.apache.airflow.image="airflow" \ + org.apache.airflow.version="${AIRFLOW_VERSION}" \ + org.apache.airflow.uid="${AIRFLOW_UID}" \ + org.apache.airflow.gid="${AIRFLOW_GID}" \ + org.apache.airflow.mainImage.buildId=${BUILD_ID} \ + org.apache.airflow.mainImage.commitSha=${COMMIT_SHA} ENTRYPOINT ["/usr/bin/dumb-init", "--", "/entrypoint"] -CMD ["airflow", "--help"] +CMD ["--help"] diff --git a/Dockerfile.ci b/Dockerfile.ci index 3b99c5b6baa97..9a39aa1972976 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -29,8 +29,8 @@ ENV 
AIRFLOW_VERSION=$AIRFLOW_VERSION ARG PYTHON_MAJOR_MINOR_VERSION="3.6" ENV PYTHON_MAJOR_MINOR_VERSION=${PYTHON_MAJOR_MINOR_VERSION} -ARG UPGRADE_TO_LATEST_REQUIREMENTS="false" -ENV UPGRADE_TO_LATEST_REQUIREMENTS=${UPGRADE_TO_LATEST_REQUIREMENTS} +ARG PIP_VERSION=20.2.4 +ENV PIP_VERSION=${PIP_VERSION} # Print versions RUN echo "Base image: ${PYTHON_BASE_IMAGE}" @@ -41,8 +41,8 @@ ENV DEBIAN_FRONTEND=noninteractive LANGUAGE=C.UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 LC_CTYPE=C.UTF-8 LC_MESSAGES=C.UTF-8 # By increasing this number we can do force build of all dependencies -ARG DEPENDENCIES_EPOCH_NUMBER="1" -# Increase the value below to force renstalling of all dependencies +ARG DEPENDENCIES_EPOCH_NUMBER="4" +# Increase the value below to force reinstalling of all dependencies ENV DEPENDENCIES_EPOCH_NUMBER=${DEPENDENCIES_EPOCH_NUMBER} # Install curl and gnupg2 - needed to download nodejs in the next step @@ -54,10 +54,26 @@ RUN apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Install basic apt dependencies -RUN curl --fail --location https://deb.nodesource.com/setup_10.x | bash - \ +ARG ADDITIONAL_DEV_APT_DEPS="" +ENV ADDITIONAL_DEV_APT_DEPS=${ADDITIONAL_DEV_APT_DEPS} + +ARG DEV_APT_COMMAND="\ + curl --fail --location https://deb.nodesource.com/setup_10.x | bash - \ && curl https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - > /dev/null \ - && echo "deb https://dl.yarnpkg.com/debian/ stable main" > /etc/apt/sources.list.d/yarn.list \ + && echo 'deb https://dl.yarnpkg.com/debian/ stable main' > /etc/apt/sources.list.d/yarn.list" +ENV DEV_APT_COMMAND=${DEV_APT_COMMAND} + +ARG ADDITIONAL_DEV_APT_COMMAND="" +ENV ADDITIONAL_DEV_APT_COMMAND=${ADDITIONAL_DEV_APT_COMMAND} + +ARG ADDITIONAL_DEV_ENV_VARS="" + +# Install basic and additional apt dependencies +RUN mkdir -pv /usr/share/man/man1 \ + && mkdir -pv /usr/share/man/man7 \ + && export ${ADDITIONAL_DEV_ENV_VARS?} \ + && bash -o pipefail -e -u -x -c "${DEV_APT_COMMAND}" \ + && bash -o pipefail -e -u -x -c "${ADDITIONAL_DEV_APT_COMMAND}" \ && apt-get update \ && apt-get install -y --no-install-recommends \ apt-utils \ @@ -76,55 +92,35 @@ RUN curl --fail --location https://deb.nodesource.com/setup_10.x | bash - \ libsasl2-dev \ libsasl2-modules \ libssl-dev \ + libenchant-dev \ locales \ netcat \ nodejs \ rsync \ sasl2-bin \ sudo \ + unixodbc \ + unixodbc-dev \ yarn \ + ${ADDITIONAL_DEV_APT_DEPS} \ && apt-get autoremove -yqq --purge \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Install MySQL client from Oracle repositories (Debian installs mariadb) -RUN KEY="A4A9406876FCBD3C456770C88C718D3B5072E1F5" \ - && GNUPGHOME="$(mktemp -d)" \ - && export GNUPGHOME \ - && for KEYSERVER in $(shuf -e \ - ha.pool.sks-keyservers.net \ - hkp://p80.pool.sks-keyservers.net:80 \ - keyserver.ubuntu.com \ - hkp://keyserver.ubuntu.com:80 \ - pgp.mit.edu) ; do \ - gpg --keyserver "${KEYSERVER}" --recv-keys "${KEY}" && break || true ; \ - done \ - && gpg --export "${KEY}" | apt-key add - \ - && gpgconf --kill all \ - rm -rf "${GNUPGHOME}"; \ - apt-key list > /dev/null \ - && echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-5.6" | tee -a /etc/apt/sources.list.d/mysql.list \ - && apt-get update \ - && apt-get install --no-install-recommends -y \ - libmysqlclient-dev \ - mysql-client \ - && apt-get autoremove -yqq --purge \ - && apt-get clean && rm -rf /var/lib/apt/lists/* +COPY scripts/docker /scripts/docker +# fix permission issue in Azure DevOps when running the script +RUN chmod a+x /scripts/docker/install_mysql.sh +RUN 
./scripts/docker/install_mysql.sh dev RUN adduser airflow \ && echo "airflow ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/airflow \ && chmod 0440 /etc/sudoers.d/airflow -# Note missing man directories on debian-buster -# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199 -RUN mkdir -pv /usr/share/man/man1 \ - && mkdir -pv /usr/share/man/man7 \ - && echo "deb http://ftp.us.debian.org/debian sid main" \ - > /etc/apt/sources.list.d/openjdk.list \ - && apt-get update \ - && apt-get install --no-install-recommends -y \ +# The latest buster images do not have libpython 2.7 installed and it is needed +# To run virtualenv tests with python 2 +ARG RUNTIME_APT_DEPS="\ gnupg \ - openjdk-8-jdk \ + libgcc-8-dev \ apt-transport-https \ bash-completion \ ca-certificates \ @@ -132,8 +128,6 @@ RUN mkdir -pv /usr/share/man/man1 \ krb5-user \ ldap-utils \ less \ - # The latest buster images do not have libpython 2.7 installed and it is needed - # To run virtualenv tests with python 2 libpython2.7-stdlib \ lsb-release \ net-tools \ @@ -144,75 +138,40 @@ RUN mkdir -pv /usr/share/man/man1 \ tmux \ unzip \ vim \ - && apt-get autoremove -yqq --purge \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + xxd" +ENV RUNTIME_APT_DEP=${RUNTIME_APT_DEPS} -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ - -# Install Hadoop and Hive -# It is done in one step to share variables. -ENV HADOOP_HOME="/opt/hadoop-cdh" HIVE_HOME="/opt/hive" - -RUN HADOOP_DISTRO="cdh" \ - && HADOOP_MAJOR="5" \ - && HADOOP_DISTRO_VERSION="5.11.0" \ - && HADOOP_VERSION="2.6.0" \ - && HADOOP_URL="https://archive.cloudera.com/${HADOOP_DISTRO}${HADOOP_MAJOR}/${HADOOP_DISTRO}/${HADOOP_MAJOR}/"\ - && HADOOP_DOWNLOAD_URL="${HADOOP_URL}hadoop-${HADOOP_VERSION}-${HADOOP_DISTRO}${HADOOP_DISTRO_VERSION}.tar.gz" \ - && HADOOP_TMP_FILE="/tmp/hadoop.tar.gz" \ - && mkdir -pv "${HADOOP_HOME}" \ - && curl --fail --location "${HADOOP_DOWNLOAD_URL}" --output "${HADOOP_TMP_FILE}" \ - && tar xzf "${HADOOP_TMP_FILE}" --absolute-names --strip-components 1 -C "${HADOOP_HOME}" \ - && rm "${HADOOP_TMP_FILE}" \ - && echo "Installing Hive" \ - && HIVE_VERSION="1.1.0" \ - && HIVE_URL="${HADOOP_URL}hive-${HIVE_VERSION}-${HADOOP_DISTRO}${HADOOP_DISTRO_VERSION}.tar.gz" \ - && HIVE_VERSION="1.1.0" \ - && HIVE_TMP_FILE="/tmp/hive.tar.gz" \ - && mkdir -pv "${HIVE_HOME}" \ - && mkdir -pv "/user/hive/warehouse" \ - && chmod -R 777 "${HIVE_HOME}" \ - && chmod -R 777 "/user/" \ - && curl --fail --location "${HIVE_URL}" --output "${HIVE_TMP_FILE}" \ - && tar xzf "${HIVE_TMP_FILE}" --strip-components 1 -C "${HIVE_HOME}" \ - && rm "${HIVE_TMP_FILE}" - -ENV PATH "${PATH}:/opt/hive/bin" - -# Install Minicluster -ENV MINICLUSTER_HOME="/opt/minicluster" - -RUN MINICLUSTER_BASE="https://github.com/bolkedebruin/minicluster/releases/download/" \ - && MINICLUSTER_VER="1.1" \ - && MINICLUSTER_URL="${MINICLUSTER_BASE}${MINICLUSTER_VER}/minicluster-${MINICLUSTER_VER}-SNAPSHOT-bin.zip" \ - && MINICLUSTER_TMP_FILE="/tmp/minicluster.zip" \ - && mkdir -pv "${MINICLUSTER_HOME}" \ - && curl --fail --location "${MINICLUSTER_URL}" --output "${MINICLUSTER_TMP_FILE}" \ - && unzip "${MINICLUSTER_TMP_FILE}" -d "/opt" \ - && rm "${MINICLUSTER_TMP_FILE}" - -# Install Docker -RUN curl --fail --location https://download.docker.com/linux/debian/gpg | apt-key add - \ - && add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian stretch stable" \ - && apt-get update \ - && apt-get -y install --no-install-recommends docker-ce \ - && apt-get autoremove -yqq --purge \ - && apt-get clean && rm 
-rf /var/lib/apt/lists/* +ARG ADDITIONAL_RUNTIME_APT_DEPS="" +ENV ADDITIONAL_RUNTIME_APT_DEPS=${ADDITIONAL_RUNTIME_APT_DEPS} -# Install kubectl -ARG KUBECTL_VERSION="v1.15.3" +ARG RUNTIME_APT_COMMAND="" +ENV RUNTIME_APT_COMMAND=${RUNTIME_APT_COMMAND} -RUN KUBECTL_URL="https://storage.googleapis.com/kubernetes-release/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" \ - && curl --fail --location "${KUBECTL_URL}" --output "/usr/local/bin/kubectl" \ - && chmod +x /usr/local/bin/kubectl +ARG ADDITIONAL_RUNTIME_APT_COMMAND="" +ENV ADDITIONAL_RUNTIME_APT_COMMAND=${ADDITIONAL_RUNTIME_APT_COMMAND} + +ARG ADDITIONAL_RUNTIME_ENV_VARS="" + +# Note missing man directories on debian-buster +# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199 +RUN mkdir -pv /usr/share/man/man1 \ + && mkdir -pv /usr/share/man/man7 \ + && export ${ADDITIONAL_RUNTIME_ENV_VARS?} \ + && bash -o pipefail -e -u -x -c "${RUNTIME_APT_COMMAND}" \ + && bash -o pipefail -e -u -x -c "${ADDITIONAL_RUNTIME_APT_COMMAND}" \ + && apt-get update \ + && apt-get install --no-install-recommends -y \ + ${RUNTIME_APT_DEPS} \ + ${ADDITIONAL_RUNTIME_APT_DEPS} \ + && apt-get autoremove -yqq --purge \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* -# Install Kind -ARG KIND_VERSION="v0.6.1" +ARG DOCKER_CLI_VERSION=19.03.9 +ENV DOCKER_CLI_VERSION=${DOCKER_CLI_VERSION} -RUN KIND_URL="https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-amd64" \ - && curl --fail --location "${KIND_URL}" --output "/usr/local/bin/kind" \ - && chmod +x /usr/local/bin/kind +RUN curl https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_CLI_VERSION}.tgz \ + | tar -C /usr/bin --strip-components=1 -xvzf - docker/docker # Setup PIP # By default PIP install run without cache to make image smaller @@ -220,43 +179,6 @@ ARG PIP_NO_CACHE_DIR="true" ENV PIP_NO_CACHE_DIR=${PIP_NO_CACHE_DIR} RUN echo "Pip no cache dir: ${PIP_NO_CACHE_DIR}" -# PIP version used to install dependencies -ARG PIP_VERSION="19.0.2" -ENV PIP_VERSION=${PIP_VERSION} -RUN echo "Pip version: ${PIP_VERSION}" - -RUN pip install --upgrade pip==${PIP_VERSION} - -# Install Google SDK -ENV GCLOUD_HOME="/opt/gcloud" CLOUDSDK_PYTHON=python${PYTHON_MAJOR_MINOR_VERSION} - -RUN GCLOUD_VERSION="274.0.1" \ - && GCOUD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${GCLOUD_VERSION}-linux-x86_64.tar.gz" \ - && GCLOUD_TMP_FILE="/tmp/gcloud.tar.gz" \ - && export CLOUDSDK_CORE_DISABLE_PROMPTS=1 \ - && mkdir -p /opt/gcloud \ - && curl "${GCOUD_URL}" -o "${GCLOUD_TMP_FILE}"\ - && tar xzf "${GCLOUD_TMP_FILE}" --strip-components 1 -C "${GCLOUD_HOME}" \ - && rm -rf "${GCLOUD_TMP_FILE}" \ - && ${GCLOUD_HOME}/bin/gcloud components install beta \ - && echo '. 
/opt/gcloud/completion.bash.inc' >> /etc/bash.bashrc - -ENV PATH="$PATH:${GCLOUD_HOME}/bin" - -# Install AWS CLI -# Unfortunately, AWS does not provide a versioned bundle -ENV AWS_HOME="/opt/aws" - -RUN AWS_TMP_DIR="/tmp/awscli/" \ - && AWS_TMP_BUNDLE="${AWS_TMP_DIR}/awscli-bundle.zip" \ - && AWS_URL="https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" \ - && mkdir -pv "${AWS_TMP_DIR}" \ - && curl "${AWS_URL}" -o "${AWS_TMP_BUNDLE}" \ - && unzip "${AWS_TMP_BUNDLE}" -d "${AWS_TMP_DIR}" \ - && "${AWS_TMP_DIR}/awscli-bundle/install" -i "${AWS_HOME}" -b /usr/local/bin/aws \ - && echo "complete -C '${AWS_HOME}/bin/aws_completer' aws" >> /etc/bash.bashrc \ - && rm -rf "${AWS_TMP_DIR}" - ARG HOME=/root ENV HOME=${HOME} @@ -268,14 +190,43 @@ ENV AIRFLOW_SOURCES=${AIRFLOW_SOURCES} WORKDIR ${AIRFLOW_SOURCES} -RUN mkdir -pv ${AIRFLOW_HOME} \ - mkdir -pv ${AIRFLOW_HOME}/dags \ +RUN mkdir -pv ${AIRFLOW_HOME} && \ + mkdir -pv ${AIRFLOW_HOME}/dags && \ mkdir -pv ${AIRFLOW_HOME}/logs # Increase the value here to force reinstalling Apache Airflow pip dependencies -ARG PIP_DEPENDENCIES_EPOCH_NUMBER="2" +ARG PIP_DEPENDENCIES_EPOCH_NUMBER="4" ENV PIP_DEPENDENCIES_EPOCH_NUMBER=${PIP_DEPENDENCIES_EPOCH_NUMBER} +# Install BATS and its dependencies for "in container" tests +ARG BATS_VERSION="0.4.0" +ARG BATS_SUPPORT_VERSION="0.3.0" +ARG BATS_ASSERT_VERSION="2.0.0" +ARG BATS_FILE_VERSION="0.2.0" + +RUN curl -sSL https://github.com/bats-core/bats-core/archive/v${BATS_VERSION}.tar.gz -o /tmp/bats.tgz \ + && tar -zxf /tmp/bats.tgz -C /tmp \ + && /bin/bash /tmp/bats-core-${BATS_VERSION}/install.sh /opt/bats && rm -rf + +RUN mkdir -p /opt/bats/lib/bats-support \ + && curl -sSL https://github.com/bats-core/bats-support/archive/v${BATS_SUPPORT_VERSION}.tar.gz -o /tmp/bats-support.tgz \ + && tar -zxf /tmp/bats-support.tgz -C /opt/bats/lib/bats-support --strip 1 && rm -rf /tmp/* + +RUN mkdir -p /opt/bats/lib/bats-assert \ + && curl -sSL https://github.com/bats-core/bats-assert/archive/v${BATS_ASSERT_VERSION}.tar.gz -o /tmp/bats-assert.tgz \ + && tar -zxf /tmp/bats-assert.tgz -C /opt/bats/lib/bats-assert --strip 1 && rm -rf /tmp/* + +RUN mkdir -p /opt/bats/lib/bats-file \ + && curl -sSL https://github.com/bats-core/bats-file/archive/v${BATS_FILE_VERSION}.tar.gz -o /tmp/bats-file.tgz \ + && tar -zxf /tmp/bats-file.tgz -C /opt/bats/lib/bats-file --strip 1 && rm -rf /tmp/* + +RUN echo "export PATH=/opt/bats/bin:${PATH}" >> /root/.bashrc + +# Additional scripts for managing BATS addons +COPY scripts/docker/load.bash /opt/bats/lib/ +RUN chmod a+x /opt/bats/lib/load.bash + + # Optimizing installation of Cassandra driver # Speeds up building the image - cassandra driver without CYTHON saves around 10 minutes ARG CASS_DRIVER_NO_CYTHON="1" @@ -293,29 +244,47 @@ ENV AIRFLOW_BRANCH=${AIRFLOW_BRANCH} # Airflow Extras installed ARG AIRFLOW_EXTRAS="all" -ENV AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS} +ARG ADDITIONAL_AIRFLOW_EXTRAS="" +ENV AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS}${ADDITIONAL_AIRFLOW_EXTRAS:+,}${ADDITIONAL_AIRFLOW_EXTRAS} RUN echo "Installing with extras: ${AIRFLOW_EXTRAS}." 
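# A hedged illustration (not part of the original Dockerfile): the ${ADDITIONAL_AIRFLOW_EXTRAS:+,}
# parameter expansion used above inserts the comma separator only when additional extras are provided,
# so the resulting AIRFLOW_EXTRAS value stays valid in both cases. For example, in plain bash:
#
#   AIRFLOW_EXTRAS="all"; ADDITIONAL_AIRFLOW_EXTRAS=""
#   echo "${AIRFLOW_EXTRAS}${ADDITIONAL_AIRFLOW_EXTRAS:+,}${ADDITIONAL_AIRFLOW_EXTRAS}"   # -> all
#
#   AIRFLOW_EXTRAS="all"; ADDITIONAL_AIRFLOW_EXTRAS="jdbc,mssql"
#   echo "${AIRFLOW_EXTRAS}${ADDITIONAL_AIRFLOW_EXTRAS:+,}${ADDITIONAL_AIRFLOW_EXTRAS}"   # -> all,jdbc,mssql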
-ARG AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD="true" -ENV AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD=${AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD} +ARG AIRFLOW_CONSTRAINTS_REFERENCE="constraints-master" +ARG AIRFLOW_CONSTRAINTS_LOCATION="https://raw.githubusercontent.com/apache/airflow/${AIRFLOW_CONSTRAINTS_REFERENCE}/constraints-${PYTHON_MAJOR_MINOR_VERSION}.txt" +ENV AIRFLOW_CONSTRAINTS_LOCATION=${AIRFLOW_CONSTRAINTS_LOCATION} -# By changing the CI build epoch we can force reinstalling Arflow from the current master +# By changing the CI build epoch we can force reinstalling Airflow from the current master # It can also be overwritten manually by setting the AIRFLOW_CI_BUILD_EPOCH environment variable. -ARG AIRFLOW_CI_BUILD_EPOCH="1" +ARG AIRFLOW_CI_BUILD_EPOCH="5" ENV AIRFLOW_CI_BUILD_EPOCH=${AIRFLOW_CI_BUILD_EPOCH} -# In case of CI-optimised builds we want to pre-install master version of airflow dependencies so that +ARG AIRFLOW_PRE_CACHED_PIP_PACKAGES="true" +ENV AIRFLOW_PRE_CACHED_PIP_PACKAGES=${AIRFLOW_PRE_CACHED_PIP_PACKAGES} + +ARG INSTALL_FROM_DOCKER_CONTEXT_FILES="" +ENV INSTALL_FROM_DOCKER_CONTEXT_FILES=${INSTALL_FROM_DOCKER_CONTEXT_FILES} + +ARG INSTALL_FROM_PYPI="true" +ENV INSTALL_FROM_PYPI=${INSTALL_FROM_PYPI} + +RUN pip install --upgrade "pip==${PIP_VERSION}" + +# In case of CI builds we want to pre-install master version of airflow dependencies so that # We do not have to always reinstall it from the scratch. # This can be reinstalled from latest master by increasing PIP_DEPENDENCIES_EPOCH_NUMBER. -# And is automatically reinstalled from the scratch every month -RUN \ - if [[ "${AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD}" == "true" ]]; then \ +# And is automatically reinstalled from the scratch every time patch release of python gets released +RUN if [[ ${AIRFLOW_PRE_CACHED_PIP_PACKAGES} == "true" ]]; then \ pip install \ - "https://github.com/${AIRFLOW_REPO}/archive/${AIRFLOW_BRANCH}.tar.gz#egg=apache-airflow[${AIRFLOW_EXTRAS}]" \ - && pip uninstall --yes apache-airflow; \ + "https://github.com/${AIRFLOW_REPO}/archive/${AIRFLOW_BRANCH}.tar.gz#egg=apache-airflow[${AIRFLOW_EXTRAS}]" \ + --constraint "${AIRFLOW_CONSTRAINTS_LOCATION}" \ + && pip uninstall --yes apache-airflow; \ fi +# Generate random hex dump file so that we can determine whether it's faster to rebuild the image +# using current cache (when our dump is the same as the remote onb) or better to pull +# the new image (when it is different) +RUN head -c 30 /dev/urandom | xxd -ps >/build-cache-hash + # Link dumb-init for backwards compatibility (so that older images also work) RUN ln -sf /usr/bin/dumb-init /usr/local/bin/dumb-init @@ -339,20 +308,33 @@ COPY airflow/version.py ${AIRFLOW_SOURCES}/airflow/version.py COPY airflow/__init__.py ${AIRFLOW_SOURCES}/airflow/__init__.py COPY airflow/bin/airflow ${AIRFLOW_SOURCES}/airflow/bin/airflow -COPY requirements/requirements-python${PYTHON_MAJOR_MINOR_VERSION}.txt \ - ${AIRFLOW_SOURCES}/requirements/requirements-python${PYTHON_MAJOR_MINOR_VERSION}.txt +ARG UPGRADE_TO_LATEST_CONSTRAINTS="false" +ENV UPGRADE_TO_LATEST_CONSTRAINTS=${UPGRADE_TO_LATEST_CONSTRAINTS} # The goal of this line is to install the dependencies from the most current setup.py from sources # This will be usually incremental small set of packages in CI optimized build, so it will be very fast # In non-CI optimized build this will install all dependencies before installing sources. 
-# Usually we will install versions constrained to the current requirements file +# Usually we will install versions based on the dependencies in setup.py and upgraded only if needed. # But in cron job we will install latest versions matching setup.py to see if there is no breaking change -RUN \ - if [[ "${UPGRADE_TO_LATEST_REQUIREMENTS}" == "true" ]]; then \ - pip install -e ".[${AIRFLOW_EXTRAS}]" --upgrade; \ - else \ - pip install -e ".[${AIRFLOW_EXTRAS}]" \ - --constraint ${AIRFLOW_SOURCES}/requirements/requirements-python${PYTHON_MAJOR_MINOR_VERSION}.txt ; \ +# and push the constraints if everything is successful +RUN if [[ ${INSTALL_FROM_PYPI} == "true" ]]; then \ + if [[ "${UPGRADE_TO_LATEST_CONSTRAINTS}" != "false" ]]; then \ + pip install -e ".[${AIRFLOW_EXTRAS}]" --upgrade --upgrade-strategy eager; \ + pip install --upgrade "pip==${PIP_VERSION}"; \ + else \ + pip install -e ".[${AIRFLOW_EXTRAS}]" --upgrade --upgrade-strategy only-if-needed; \ + pip install --upgrade "pip==${PIP_VERSION}"; \ + fi; \ + fi + +# If wheel files are found in /docker-context-files during installation +# they are also installed additionally to whatever is installed from Airflow. +COPY docker-context-files/ /docker-context-files/ + +RUN if [[ ${INSTALL_FROM_DOCKER_CONTEXT_FILES} != "true" ]]; then \ + if ls /docker-context-files/*.{whl,tar.gz} 1> /dev/null 2>&1; then \ + pip install --no-deps /docker-context-files/*.{whl,tar.gz}; \ + fi ; \ fi # Copy all the www/ files we need to compile assets. Done as two separate COPY @@ -363,34 +345,33 @@ COPY airflow/www_rbac/static ${AIRFLOW_SOURCES}/airflow/www_rbac/static/ # Package JS/css for production RUN yarn --cwd airflow/www_rbac run prod -COPY entrypoint.sh /entrypoint.sh - -# Copy selected subdirectories only -COPY .github/ ${AIRFLOW_SOURCES}/.github/ -COPY dags/ ${AIRFLOW_SOURCES}/dags/ -COPY common/ ${AIRFLOW_SOURCES}/common/ -COPY licenses/ ${AIRFLOW_SOURCES}/licenses/ -COPY scripts/ci/ ${AIRFLOW_SOURCES}/scripts/ci/ -COPY docs/ ${AIRFLOW_SOURCES}/docs/ -COPY tests/ ${AIRFLOW_SOURCES}/tests/ -COPY airflow/ ${AIRFLOW_SOURCES}/airflow/ -COPY .coveragerc .rat-excludes .flake8 LICENSE MANIFEST.in NOTICE CHANGELOG.txt \ - .github pytest.ini \ - setup.cfg setup.py \ - ${AIRFLOW_SOURCES}/ - -# Needed for building images via docker-in-docker inside the docker -COPY Dockerfile.ci ${AIRFLOW_SOURCES}/Dockerfile.ci +COPY scripts/in_container/entrypoint_ci.sh /entrypoint +RUN chmod a+x /entrypoint + +# We can copy everything here. The Context is filtered by dockerignore. This makes sure we are not +# copying over stuff that is accidentally generated or that we do not need (such as egg-info) +# if you want to add something that is missing and you expect to see it in the image you can +# add it with ! in .dockerignore next to the airflow, test etc. directories there +COPY . 
${AIRFLOW_SOURCES}/ # Install autocomplete for airflow -RUN register-python-argcomplete airflow >> ~/.bashrc +RUN if command -v airflow; then \ + register-python-argcomplete airflow >> ~/.bashrc ; \ + fi -# Install autocomplete for Kubeclt -RUN echo "source /etc/bash_completion" >> ~/.bashrc \ - && kubectl completion bash >> ~/.bashrc +# Install autocomplete for Kubectl +RUN echo "source /etc/bash_completion" >> ~/.bashrc WORKDIR ${AIRFLOW_SOURCES} +# Install Helm +ARG HELM_VERSION="v3.2.4" + +RUN SYSTEM=$(uname -s | tr '[:upper:]' '[:lower:]') \ + && HELM_URL="https://get.helm.sh/helm-${HELM_VERSION}-${SYSTEM}-amd64.tar.gz" \ + && curl --location "${HELM_URL}" | tar -xvz -O "${SYSTEM}"-amd64/helm > /usr/local/bin/helm \ + && chmod +x /usr/local/bin/helm + # Additional python deps to install ARG ADDITIONAL_PYTHON_DEPS="" @@ -402,8 +383,25 @@ WORKDIR ${AIRFLOW_SOURCES} ENV PATH="${HOME}:${PATH}" -EXPOSE 8080 +# Needed to stop Gunicorn from crashing when /tmp is now mounted from host +ENV GUNICORN_CMD_ARGS="--worker-tmp-dir /dev/shm/" + +ARG BUILD_ID +ENV BUILD_ID=${BUILD_ID} +ARG COMMIT_SHA +ENV COMMIT_SHA=${COMMIT_SHA} + +LABEL org.apache.airflow.distro="debian" \ + org.apache.airflow.distro.version="buster" \ + org.apache.airflow.module="airflow" \ + org.apache.airflow.component="airflow" \ + org.apache.airflow.image="airflow-ci" \ + org.apache.airflow.version="${AIRFLOW_VERSION}" \ + org.apache.airflow.uid="0" \ + org.apache.airflow.gid="0" \ + org.apache.airflow.buildId=${BUILD_ID} \ + org.apache.airflow.commitSha=${COMMIT_SHA} -ENTRYPOINT ["/usr/bin/dumb-init", "--", "/entrypoint.sh"] +EXPOSE 8080 -CMD ["--help"] +ENTRYPOINT ["/usr/bin/dumb-init", "--", "/entrypoint"] diff --git a/IMAGES.rst b/IMAGES.rst index 9557129bfc046..b62a71ac3a3d8 100644 --- a/IMAGES.rst +++ b/IMAGES.rst @@ -22,8 +22,13 @@ Airflow docker images Airflow has two images (build from Dockerfiles): -* CI image (Dockerfile.ci) - used for running tests and local development -* Production image (Dockerfile) - used to run production-ready Airflow installations + * Production image (Dockerfile) - that can be used to build your own production-ready Airflow installation + You can read more about building and using the production image in the + `Production Deployments `_ document. + The image is built using `Dockerfile `_ + + * CI image (Dockerfile.ci) - used for running tests and local development. The image is built using + `Dockerfile.ci `_ Image naming conventions ======================== @@ -34,13 +39,24 @@ The images are named as follows: where: -* ``BRANCH_OR_TAG`` - branch or tag used when creating the image. Examples: master, v1-10-test, 1.10.10 - The ``master`` and ``v1-10-test`` labels are built from branches so they change over time. the 1.10.* and in +* ``BRANCH_OR_TAG`` - branch or tag used when creating the image. Examples: ``master``, ``v1-10-test``, ``1.10.15`` + The ``master`` and ``v1-10-test`` labels are built from branches so they change over time. The ``1.10.*`` and in the future ``2.*`` labels are build from git tags and they are "fixed" once built. -* PYTHON_MAJOR_MINOR_VERSION - version of python used to build the image. Examples: 3.5, 3.7 +* ``PYTHON_MAJOR_MINOR_VERSION`` - version of python used to build the image. Examples: ``3.5``, ``3.7`` * The ``-ci`` suffix is added for CI images * The ``-manifest`` is added for manifest images (see below for explanation of manifest images) +We also store (to increase speed of local build/pulls) python images that were used to build +the CI images. 
Each CI image, when built, uses the current python version of the base images. Those +python images are regularly updated (with bugfixes/security fixes), so for example python3.8 from +last week might be a different image than python3.8 today. Therefore whenever we push a CI image +to the airflow repository, we also push the python image that was used to build it. This image is stored +as ``apache/airflow:python-3.8-``. + +Since those are simply snapshots of the existing python images, DockerHub does not create a separate +copy of those images - all layers are mounted from the original python images and those are merely +labels pointing to them. + Building docker images ====================== @@ -48,18 +64,18 @@ The easiest way to build those images is to use ``_. Note! Breeze by default builds production image from local sources. You can change its behaviour by providing ``--install-airflow-version`` parameter, where you can specify the -tag/branch used to download Airflow package from in github repository. You can -also change the repository itself by adding --dockerhub-user and --dockerhub-repo flag values. +tag/branch used to download the Airflow package from the GitHub repository. You can +also change the repository itself by adding ``--dockerhub-user`` and ``--dockerhub-repo`` flag values. You can build the CI image using this command: -.. code-block:: +.. code-block:: bash ./breeze build-image You can build production image using this command: -.. code-block:: +.. code-block:: bash ./breeze build-image --production-image @@ -67,16 +83,22 @@ By adding ``--python `` parameter you can build the image version for the chosen python version. The images are built with default extras - different extras for CI and production image and you -can change the extras via the ``--extras`` parameters. You can see default extras used via -``./breeze flags``. +can change the extras via the ``--extras`` parameters and add new ones with ``--additional-extras``. +You can see default extras used via ``./breeze flags``. For example if you want to build python 3.7 version of production image with "all" extras installed you should run this command: -.. code-block:: +.. code-block:: bash ./breeze build-image --python 3.7 --extras "all" --production-image +If you just want to add new extras you can add them as follows: + +.. code-block:: bash + + ./breeze build-image --python 3.7 --additional-extras "all" --production-image + The command that builds the CI image is optimized to minimize the time needed to rebuild the image when the source code of Airflow evolves. This means that if you already have the image locally downloaded and built, the scripts will determine whether the rebuild is needed in the first place. Then the scripts will @@ -90,60 +112,265 @@ In Breeze by default, the airflow is installed using local sources of Apache Air You can also build production images from PIP packages via providing ``--install-airflow-version`` parameter to Breeze: -.. code-block:: +.. code-block:: bash - ./breeze build-image --python 3.7 --extras=gcp --production-image --install-airflow-version=1.10.9 + ./breeze build-image --python 3.7 --additional-extras=presto \ + --production-image --install-airflow-version=1.10.15 -This will build the image using command similar to: -.. code-block:: + +.. note:: + + In November 2020, a new version of pip (20.3) was released with a new, 2020 resolver. This resolver + does not yet work with Apache Airflow and might lead to errors in installation, depending on your choice + of extras.
In order to install Airflow you need to either downgrade pip to version 20.2.4 + ``pip install --upgrade pip==20.2.4`` or, in case you use Pip 20.3, you need to add the option + ``--use-deprecated legacy-resolver`` to your pip install command. - pip install apache-airflow[gcp]==1.10.9 \ - --constraint https://raw.githubusercontent.com/apache/airflow/v1-10-test/requirements/requirements-python3.7.txt -This will also download entrypoint script from https://raw.githubusercontent.com/apache/airflow/v1-10-test/entrypoint.sh -url. It is important so that we have matching version of the requirements. +This will build the image using command similar to: + +.. code-block:: bash -The requirement files and entrypoint only appeared in version 1.10.10 of airflow so if you install -an earlier version - both constraint and requirements should point to 1.10.10 version. + pip install \ + apache-airflow[async,aws,azure,celery,dask,elasticsearch,gcp,kubernetes,mysql,postgres,redis,slack,ssh,statsd,virtualenv,presto]==1.10.15 \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.15/constraints-3.6.txt" You can also build production images from specific Git version via providing ``--install-airflow-reference`` -parameter to Breeze: +parameter to Breeze (this time constraints are taken from the ``constraints-master`` branch which is the +HEAD of development for constraints): -.. code-block:: +.. code-block:: bash + + pip install "https://github.com/apache/airflow/archive/.tar.gz#egg=apache-airflow" \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-master/constraints-3.6.txt" + +You can also skip installing airflow by providing ``--install-airflow-version none`` parameter to Breeze: + +.. code-block:: bash + + ./breeze build-image --python 3.7 --additional-extras=presto \ + --production-image --install-airflow-version=none --install-from-local-files-when-building + +In this case you usually install airflow and all needed packages from the ``docker-context-files`` folder. + +Using cache during builds +========================= + +The default mechanism used in Breeze for building CI images uses images pulled from DockerHub or +GitHub Image Registry. This is done to speed up local builds and CI builds - instead of 15 minutes +for rebuild of CI images, it usually takes less than 3 minutes when the cache is used. For CI builds this is +usually the best strategy - to use the default "pull" cache. This is the default strategy when +``_ builds are performed. + +For Production Image - which is far smaller and faster to build, it's better to use the local build cache (the +standard mechanism that docker uses). This is the default strategy for production images when +``_ builds are performed. The first time you run it, it will take considerably longer than +if you use the pull mechanism, but when you then make small, incremental changes to local sources, the +Dockerfile or the image scripts, further rebuilds with the local build cache will be considerably faster. + +You can also disable build cache altogether. This is the strategy used by the scheduled builds in CI - they +will always rebuild all the images from scratch. + +You can change the strategy by providing one of the ``--build-cache-local``, ``--build-cache-pulled`` or +even ``--build-cache-disabled`` flags when you run Breeze commands. For example: + +.. code-block:: bash + + ./breeze build-image --python 3.7 --build-cache-local + +Will build the CI image using the local build cache (note that it will take quite a long time the first +time you run it). + ..
code-block:: bash + + ./breeze build-image --python 3.7 --production-image --build-cache-pulled + +Will build the production image with pulled images as cache. + + +.. code-block:: bash + + ./breeze build-image --python 3.7 --production-image --build-cache-disabled + +Will build the production image from scratch. - pip install https://github.com/apache/airflow/archive/.tar.gz#egg=apache-airflow \ - --constraint https://raw.githubusercontent.com/apache/airflow//requirements/requirements-python3.7.txt +You can also switch the docker caching strategy by setting the ``DOCKER_CACHE`` variable to "local", "pulled", or +"disabled" and exporting it. -This will also Download entrypoint script from ``https://raw.githubusercontent.com/apache/airflow//entrypoint.sh`` -url. +.. code-block:: bash + + export DOCKER_CACHE="local" + +or + +.. code-block:: bash + + export DOCKER_CACHE="disabled" + + +Choosing image registry +======================= + +By default images are pulled from and pushed to the DockerHub registry when you use Breeze's push-image +or build commands. + +Our images are named as follows: + +.. code-block:: bash + + apache/airflow:[-]-pythonX.Y - for production images + apache/airflow:[-]-pythonX.Y-ci - for CI images + apache/airflow:[-]-pythonX.Y-build - for production build stage + +For example: + +.. code-block:: bash + + apache/airflow:master-python3.6 - production "latest" image from current master + apache/airflow:master-python3.6-ci - CI "latest" image from current master + apache/airflow:v1-10-test-python2.7-ci - CI "latest" image from current v1-10-test branch + apache/airflow:1.10.15-python3.6 - production image for 1.10.15 release + apache/airflow:1.10.15-1-python3.6 - production image for 1.10.15 with some patches applied + + +You can see DockerHub images at ``_ + +By default DockerHub registry is used when you push or pull such images. +However for CI builds we keep the images in GitHub registry as well - this way we can easily push +the images automatically after merge requests and use such images for Pull Requests +as cache - which makes it much faster for CI builds (images are available in the cache +right after a merged request in master finishes its build). The difference is visible especially if +significant changes are done in the Dockerfile.ci. + +The images are named differently (in the Docker definition of image names, the registry URL is part of the +image name if DockerHub is not used as the registry). GitHub also has its own structure for registries - +each project has its own registry naming convention that should be followed. The names of +images for the GitHub registry are: + +.. code-block:: bash + + docker.pkg.github.com/apache/airflow/-pythonX.Y - for production images + docker.pkg.github.com/apache/airflow/-pythonX.Y-ci - for CI images + docker.pkg.github.com/apache/airflow/-pythonX.Y-build - for production build stage + +Note that we never push or pull TAG images to GitHub registry. It is only used for CI builds. + +You can see all the current GitHub images at ``_ + +In order to interact with the GitHub images you need to add the ``--github-registry`` flag to the pull/push +commands in Breeze. This way the images will be pulled/pushed from/to GitHub rather than from/to +DockerHub. Images are built locally as ``apache/airflow`` images but then they are tagged with the right +GitHub tags for you. + +You can read more about the CI configuration and how CI builds are using DockerHub/GitHub images +in ``_.
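For example, a minimal sketch of pulling a CI image through the GitHub registry rather than DockerHub
(the ``docker login`` step and the ``GITHUB_USERNAME``/``GITHUB_TOKEN`` variables are assumptions for
illustration, not commands taken from this document):

.. code-block:: bash

    # Log in to the GitHub package registry first (assumed to be required even for public images)
    echo "${GITHUB_TOKEN}" | docker login docker.pkg.github.com -u "${GITHUB_USERNAME}" --password-stdin

    # Pull the CI image for master/python 3.6 using the naming convention described above
    docker pull docker.pkg.github.com/apache/airflow/master-python3.6-ci

    # Optionally retag it locally so it can be used the same way as the DockerHub image
    docker tag docker.pkg.github.com/apache/airflow/master-python3.6-ci apache/airflow:master-python3.6-ci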
+ +Note that you need to be a committer and have the right to push to DockerHub and GitHub and you need to +be logged in. Only committers can push images directly. Technical details of Airflow images =================================== -The CI image is used by Breeze as shell image but it is also used during CI builds on Travis. +The CI image is used by Breeze as the shell image but it is also used during CI builds. The image is a single-segment image that contains Airflow installation with "all" dependencies installed. -It is optimised for rebuild speed (AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD flag set to "true"). -It installs PIP dependencies from the current branch first - so that any changes in setup.py do not trigger -reinstalling of all dependencies. There is a second step of installation that re-installs the dependencies +It is optimised for rebuild speed. It installs PIP dependencies from the current branch first - +so that any changes in setup.py do not trigger reinstalling of all dependencies. +There is a second step of installation that re-installs the dependencies from the latest sources so that we are sure that latest dependencies are installed. The production image is a multi-segment image. The first segment "airflow-build-image" contains all the build essentials and related dependencies that allow to install airflow locally. By default the image is -build from a released version of Airflow from Github, but by providing some extra arguments you can also +built from a released version of Airflow from GitHub, but by providing some extra arguments you can also build it from local sources. This is particularly useful in CI environment where we are using the image to run Kubernetes tests. See below for the list of arguments that should be provided to build production image from the local sources. -Manually building the images ----------------------------- +The image is primarily optimised for size of the final image, but also for speed of rebuilds - the +'airflow-build-image' segment uses the same technique as the CI builds for pre-installing PIP dependencies. +It first pre-installs them from the right GitHub branch and only after that the final airflow installation is +done from either local sources or a remote location (PIP or GitHub repository). + +Customizing the image +..................... + +Customizing the image is an alternative way of adding your own dependencies to the image. + +The easiest way to build the image is to use the ``breeze`` script, but you can also build such customized +image by running an appropriately crafted docker build in which you specify all the ``build-args`` +that you need to add to customize it. You can read about all the args and ways you can build the image +in the `<#ci-image-build-arguments>`_ chapter below. + +Here just a few examples are presented which should give you a general understanding of what you can customize. +This builds the production image in version 3.7 with additional airflow extras from the 1.10.15 PyPI package and +additional apt dev and runtime dependencies. + +.. code-block:: bash + + docker build .
-f Dockerfile.ci \ + --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ + --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ + --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ + --build-arg AIRFLOW_VERSION="1.10.15" \ + --build-arg AIRFLOW_INSTALL_VERSION="==1.10.15" \ + --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-1-10" \ + --build-arg AIRFLOW_SOURCES_FROM="empty" \ + --build-arg AIRFLOW_SOURCES_TO="/empty" \ + --build-arg ADDITIONAL_AIRFLOW_EXTRAS="jdbc" + --build-arg ADDITIONAL_PYTHON_DEPS="pandas" + --build-arg ADDITIONAL_DEV_APT_DEPS="gcc g++" + --build-arg ADDITIONAL_RUNTIME_APT_DEPS="default-jre-headless" + --tag my-image + + +the same image can be built using ``breeze`` (it supports auto-completion of the options): + +.. code-block:: bash + + ./breeze build-image -f Dockerfile.ci \ + --production-image --python 3.7 --install-airflow-version=1.10.15 \ + --additional-extras=jdbc --additional-python-deps="pandas" \ + --additional-dev-apt-deps="gcc g++" --additional-runtime-apt-deps="default-jre-headless" You can build the default production image with standard ``docker build`` command but they will only build default versions of the image and will not use the dockerhub versions of images as cache. -CI images -......... +You can customize more aspects of the image - such as additional commands executed before apt dependencies +are installed, or adding extra sources to install your dependencies from. You can see all the arguments +described below but here is an example of rather complex command to customize the image +based on example in `this comment `_: + +.. code-block:: bash + + docker build . -f Dockerfile.ci \ + --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ + --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ + --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ + --build-arg AIRFLOW_VERSION="1.10.15" \ + --build-arg AIRFLOW_INSTALL_VERSION="==1.10.15" \ + --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-1-10" \ + --build-arg AIRFLOW_SOURCES_FROM="empty" \ + --build-arg AIRFLOW_SOURCES_TO="/empty" \ + --build-arg ADDITIONAL_AIRFLOW_EXTRAS="slack" \ + --build-arg ADDITIONAL_PYTHON_DEPS="apache-airflow-backport-providers-odbc \ + azure-storage-blob \ + sshtunnel \ + google-api-python-client \ + oauth2client \ + beautifulsoup4 \ + dateparser \ + rocketchat_API \ + typeform" \ + --build-arg ADDITIONAL_DEV_APT_DEPS="msodbcsql17 unixodbc-dev g++" \ + --build-arg ADDITIONAL_DEV_APT_COMMAND="curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add --no-tty - && curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list" \ + --build-arg ADDITIONAL_DEV_ENV_VARS="ACCEPT_EULA=Y" \ + --build-arg ADDITIONAL_RUNTIME_APT_COMMAND="curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add --no-tty - && curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list" \ + --build-arg ADDITIONAL_RUNTIME_APT_DEPS="msodbcsql17 unixodbc git procps vim" \ + --build-arg ADDITIONAL_RUNTIME_ENV_VARS="ACCEPT_EULA=Y" \ + --tag my-image + +CI image build arguments +........................ 
The following build arguments (``--build-arg`` in docker build command) can be used for CI images: @@ -160,15 +387,9 @@ The following build arguments (``--build-arg`` in docker build command) can be u | ``DEPENDENCIES_EPOCH_NUMBER`` | ``2`` | increasing this number will reinstall | | | | all apt dependencies | +------------------------------------------+------------------------------------------+------------------------------------------+ -| ``KUBECTL_VERSION`` | ``v1.15.3`` | version of kubectl installed | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``KIND_VERSION`` | ``v0.6.1`` | version of kind installed | -+------------------------------------------+------------------------------------------+------------------------------------------+ | ``PIP_NO_CACHE_DIR`` | ``true`` | if true, then no pip cache will be | | | | stored | +------------------------------------------+------------------------------------------+------------------------------------------+ -| ``PIP_VERSION`` | ``19.0.2`` | version of PIP to use | -+------------------------------------------+------------------------------------------+------------------------------------------+ | ``HOME`` | ``/root`` | Home directory of the root user (CI | | | | image has root user as default) | +------------------------------------------+------------------------------------------+------------------------------------------+ @@ -183,34 +404,108 @@ The following build arguments (``--build-arg`` in docker build command) can be u | ``CASS_DRIVER_NO_CYTHON`` | ``1`` | if set to 1 no CYTHON compilation is | | | | done for cassandra driver (much faster) | +------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_CONTAINER_CI_OPTIMISED_BUILD`` | ``true`` | if set then PIP dependencies are | -| | | installed from repo first before they | -| | | are reinstalled from local sources. This | -| | | allows for incremental faster builds | -| | | when requirements change | -+------------------------------------------+------------------------------------------+------------------------------------------+ | ``AIRFLOW_REPO`` | ``apache/airflow`` | the repository from which PIP | -| | | dependencies are installed (CI | -| | | optimised) | +| | | dependencies are pre-installed | +------------------------------------------+------------------------------------------+------------------------------------------+ | ``AIRFLOW_BRANCH`` | ``master`` | the branch from which PIP dependencies | -| | | are installed (CI optimised) | +| | | are pre-installed | +------------------------------------------+------------------------------------------+------------------------------------------+ | ``AIRFLOW_CI_BUILD_EPOCH`` | ``1`` | increasing this value will reinstall PIP | | | | dependencies from the repository from | | | | scratch | +------------------------------------------+------------------------------------------+------------------------------------------+ +| ``AIRFLOW_CONSTRAINTS_LOCATION`` | | If not empty, it will override the | +| | | source of the constraints with the | +| | | specified URL or file. Note that the | +| | | file has to be in docker context so | +| | | it's best to place such file in | +| | | one of the folders included in | +| | | .dockerignore. for example in the | +| | | 'docker-context-files'. 
Note that the | +| | | location does not work for the first | +| | | stage of installation when the | +| | | stage of installation when the | +| | | ``AIRFLOW_PRE_CACHED_PIP_PACKAGES`` is | +| | | set to true. Default location from | +| | | GitHub is used in this case. | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``AIRFLOW_CONSTRAINTS_REFERENCE`` | ``constraints-master`` | reference (branch or tag) from GitHub | +| | | repository from which constraints are | +| | | used. By default it is set to | +| | | ``constraints-master`` but can be | +| | | ``constraints-1-10`` for 1.10.* versions | +| | | or it could point to specific version | +| | | for example ``constraints-1.10.12`` | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``INSTALL_FROM_DOCKER_CONTEXT_FILES`` | ``false`` | If set to true, Airflow and it's | +| | | dependencies are installed from locally | +| | | downloaded .whl files placed in the | +| | | ``docker-context-files``. | ++------------------------------------------+------------------------------------------+------------------------------------------+ | ``AIRFLOW_EXTRAS`` | ``all`` | extras to install | +------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_PYTHON_DEPS`` | \```\` | additional python dependencies to | +| ``INSTALL_FROM_PYPI`` | ``true`` | If set to true, Airflow is installed | +| | | from pypi. If you want to install | +| | | Airflow from externally provided binary | +| | | package you can set it to false, place | +| | | the package in ``docker-context-files`` | +| | | and set | +| | | ``INSTALL_FROM_DOCKER_CONTEXT_FILES`` to | +| | | true. For this you have to also set the | +| | | ``AIRFLOW_PRE_CACHED_PIP_PACKAGES`` flag | +| | | to false | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``AIRFLOW_PRE_CACHED_PIP_PACKAGES`` | ``true`` | Allows to pre-cache airflow PIP packages | +| | | from the GitHub of Apache Airflow | +| | | This allows to optimize iterations for | +| | | Image builds and speeds up CI builds | +| | | But in some corporate environments it | +| | | might be forbidden to download anything | +| | | from public repositories. 
| ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``ADDITIONAL_AIRFLOW_EXTRAS`` | | additional extras to install | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``ADDITIONAL_PYTHON_DEPS`` | | additional python dependencies to | | | | install | +------------------------------------------+------------------------------------------+------------------------------------------+ +| ``DEV_APT_COMMAND`` | (see Dockerfile) | Dev apt command executed before dev deps | +| | | are installed in the first part of image | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``ADDITIONAL_DEV_APT_COMMAND`` | | Additional Dev apt command executed | +| | | before dev dep are installed | +| | | in the first part of the image | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``DEV_APT_DEPS`` | (see Dockerfile) | Dev APT dependencies installed | +| | | in the first part of the image | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``ADDITIONAL_DEV_APT_DEPS`` | | Additional apt dev dependencies | +| | | installed in the first part of the image | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``ADDITIONAL_DEV_APT_ENV`` | | Additional env variables defined | +| | | when installing dev deps | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``RUNTIME_APT_COMMAND`` | (see Dockerfile) | Runtime apt command executed before deps | +| | | are installed in first part of the image | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``ADDITIONAL_RUNTIME_APT_COMMAND`` | | Additional Runtime apt command executed | +| | | before runtime dep are installed | +| | | in the second part of the image | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``RUNTIME_APT_DEPS`` | (see Dockerfile) | Runtime APT dependencies installed | +| | | in the second part of the image | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``ADDITIONAL_RUNTIME_APT_DEPS`` | | Additional apt runtime dependencies | +| | | installed in second part of the image | ++------------------------------------------+------------------------------------------+------------------------------------------+ +| ``ADDITIONAL_RUNTIME_APT_ENV`` | | Additional env variables defined | +| | | when installing runtime deps | ++------------------------------------------+------------------------------------------+------------------------------------------+ Here are some examples of how CI images can built manually. CI is always built from local sources. This builds the CI image in version 3.7 with default extras ("all"). -.. code-block:: +.. code-block:: bash docker build . 
-f Dockerfile.ci --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 @@ -218,161 +513,46 @@ This builds the CI image in version 3.7 with default extras ("all"). This builds the CI image in version 3.6 with "gcp" extra only. -.. code-block:: +.. code-block:: bash docker build . -f Dockerfile.ci --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ --build-arg PYTHON_MAJOR_MINOR_VERSION=3.6 --build-arg AIRFLOW_EXTRAS=gcp -Production images -................. +This builds the CI image in version 3.6 with "apache-beam" extra added. -The following build arguments (``--build-arg`` in docker build command) can be used for production images: +.. code-block:: bash -+------------------------------------------+------------------------------------------+------------------------------------------+ -| Build argument | Default value | Description | -+==========================================+==========================================+==========================================+ -| ``PYTHON_BASE_IMAGE`` | ``python:3.6-slim-buster`` | Base python image | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``PYTHON_MAJOR_MINOR_VERSION`` | ``3.6`` | major/minor version of Python (should | -| | | match base image) | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_VERSION`` | ``2.0.0.dev0`` | version of Airflow | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_ORG`` | ``apache`` | Github organisation from which Airflow | -| | | is installed (when installed from repo) | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_REPO`` | ``airflow`` | Github repository from which Airflow is | -| | | installed (when installed from repo) | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_GIT_REFERENCE`` | ``master`` | reference (branch or tag) from Github | -| | | repository from which Airflow is | -| | | installed (when installed from repo) | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``REQUIREMENTS_GIT_REFERENCE`` | ``master`` | reference (branch or tag) from Github | -| | | repository from which requirements are | -| | | downloaded for constraints (when | -| | | installed from repo). | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``WWW_FOLDER`` | ``www`` | folder where www pages are generated - | -| | | should be set to www_rbac in case of | -| | | 1.10 image builds. 
| -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_EXTRAS`` | (see Dockerfile) | Default extras with which airflow is | -| | | installed | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_HOME`` | ``/opt/airflow`` | Airflow’s HOME (that’s where logs and | -| | | sqlite databases are stored) | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_UID`` | ``50000`` | Airflow user UID | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_GID`` | ``50000`` | Airflow group GID | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``PIP_VERSION`` | ``19.0.2`` | version of PIP to use | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``CASS_DRIVER_BUILD_CONCURRENCY`` | ``8`` | Number of processors to use for | -| | | cassandra PIP install (speeds up | -| | | installing in case cassandra extra is | -| | | used). | -+------------------------------------------+------------------------------------------+------------------------------------------+ - -There are build arguments that determine the installation mechanism of Apache Airflow for the -production image. There are three types of build: - -* From local sources (by default for example when you use ``docker build .``) -* You can build the image from released PyPi airflow package (used to build the official Docker image) -* You can build the image from any version in GitHub repository(this is used mostly for system testing). - -+-----------------------------------+-----------------------------------+ -| Build argument | What to specify | -+===================================+===================================+ -| ``AIRFLOW_INSTALL_SOURCES`` | Should point to the sources of | -| | of Apache Airflow. It can be | -| | either "." for installation from | -| | local sources, "apache-airflow" | -| | for installation from packages | -| | and URL to installation from | -| | GitHub repository (see below) | -| | to install from any GitHub | -| | version | -+-----------------------------------+-----------------------------------+ -| ``AIRFLOW_INSTALL_VERSION`` | Optional - might be used for | -| | package installation case to | -| | set Airflow version for example | -| | "==1.10.10" | -+-----------------------------------+-----------------------------------+ -| ``CONSTRAINT_REQUIREMENTS`` | Should point to requirements file | -| | in case of installation from | -| | the package or from GitHub URL. | -| | See examples below | -+-----------------------------------+-----------------------------------+ -| ``ENTRYPOINT_FILE`` | Should point to entrypoint.sh | -| | file in case of installation from | -| | the package or from GitHub URL. | -| | See examples below | -+-----------------------------------+-----------------------------------+ -| ``AIRFLOW_WWW`` | In case of Airflow 2.0 it should | -| | be "www", in case of Airflow 1.10 | -| | series it should be "www_rbac". | -| | See examples below | -+-----------------------------------+-----------------------------------+ -| ``AIRFLOW_SOURCES_FROM`` | Sources of Airflow. 
Should be set | -| | to "Dockerfile" to avoid costly | -| | Docker context copying | -| | in case of installation from | -| | the package or from GitHub URL. | -| | See examples below | -+-----------------------------------+-----------------------------------+ -| ``AIRFLOW_SOURCES_TO`` | Target for Airflow sources. Set | -| | to "/Dockerfile" to avoid costly | -| | Docker context copying | -| | in case of installation from | -| | the package or from GitHub URL. | -| | See examples below | -+-----------------------------------+-----------------------------------+ - - -This builds production image in version 3.6 with default extras from the local sources: + docker build . -f Dockerfile.ci --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ + --build-arg PYTHON_MAJOR_MINOR_VERSION=3.6 --build-arg ADDITIONAL_AIRFLOW_EXTRAS="apache-beam" -.. code-block:: +This builds the CI image in version 3.6 with "mssql" additional package added. - docker build . +.. code-block:: bash -This builds the production image in version 3.7 with default extras from 1.10.9 tag and -requirements taken from v1-10-test branch in Github. -Note that versions 1.10.9 and below have no requirements so requirements should be taken from head of -the v1-10-test branch. Once we release 1.10.10 we can take them from the 1.10.10 tag. Also -Note that in case of Airflow 1.10 we need to specify "www_rbac" instead of "wwww" for -WWW_FOLDER argument. + docker build . -f Dockerfile.ci --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ + --build-arg PYTHON_MAJOR_MINOR_VERSION=3.6 --build-arg ADDITIONAL_PYTHON_DEPS="mssql" + +This builds the CI image in version 3.6 with "gcc" and "g++" additional apt dev dependencies added. .. code-block:: - docker build . \ - --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ - --build-arg AIRFLOW_INSTALL_SOURCES="https://github.com/apache/airflow/archive/1.10.10.tar.gz#egg=apache-airflow" \ - --build-arg CONSTRAINT_REQUIREMENTS="https://raw.githubusercontent.com/apache/airflow/1.10.10/requirements/requirements-python3.7.txt" \ - --build-arg ENTRYPOINT_FILE="https://raw.githubusercontent.com/apache/airflow/1.10.10/entrypoint.sh" \ - --build-arg SOURCES_FROM="Dockerfile" \ - --build-arg SOURCES_TO="/Dockerfile" \ - --build-arg WWW_FOLDER="www_rbac" + docker build . -f Dockerfile.ci --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ + --build-arg PYTHON_MAJOR_MINOR_VERSION=3.6 --build-arg ADDITIONAL_DEV_APT_DEPS="gcc g++" -This builds the production image in version 3.6 with default extras from current sources. +This builds the CI image in version 3.6 with "jdbc" extra and "default-jre-headless" additional apt runtime dependencies added. .. code-block:: - docker build . --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 --build-arg COPY_SOURCE=. 
--build-arg COPY_TARGET=/opt/airflow --build-arg AIRFLOW_SOURCES=/opt/airflow \ - --build-arg CONSTRAINT_REQUIREMENTS=requirements/requirements-python3.7.txt" \ - --build-arg ENTRYPOINT_FILE=entrypoint.sh \ - --build-arg AIRFLOW_INSTALL_SOURCES="apache-airflow" \ - --build-arg AIRFLOW_INSTALL_VERSION="==1.10.10" \ - --build-arg CONSTRAINT_REQUIREMENTS="https://raw.githubusercontent.com/apache/airflow/1.10.10/requirements/requirements-python3.7.txt" - --build-arg ENTRYPOINT_FILE="https://raw.githubusercontent.com/apache/airflow/1.10.10/entrypoint.sh" \ - --build-arg SOURCES_FROM="Dockerfile" \ - --build-arg SOURCES_TO="/Dockerfile" \ - --build-arg WWW_FOLDER="www_rbac" + docker build . -f Dockerfile.ci --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ + --build-arg PYTHON_MAJOR_MINOR_VERSION=3.6 --build-arg AIRFLOW_EXTRAS=jdbc --build-arg ADDITIONAL_RUNTIME_APT_DEPS="default-jre-headless" + +Production images +----------------- + +You can find details about using, building, extending and customising the production images in the +`Latest documentation `_ + Image manifests --------------- @@ -380,7 +560,7 @@ Together with the main CI images we also build and push image manifests. Those manifests are very small images that contain only results of the docker inspect for the image. This is in order to be able to determine very quickly if the image in the docker registry has changed a lot since the last time. -Unfortunately docker registry (specifically dockerhub registry) has no anonymous way of querying image +Unfortunately docker registry (specifically DockerHub registry) has no anonymous way of querying image details via API, you need to download the image to inspect it. We overcame it in the way that always when we build the image we build a very small image manifest and push it to registry together with the main CI image. The tag for the manifest image is the same as for the image it refers @@ -399,29 +579,90 @@ You can do it via the ``--force-pull-images`` flag to force pulling the latest i For production image: -.. code-block:: +.. code-block:: bash ./breeze build-image --force-pull-images --production-image For the CI image, Breeze automatically uses force pulling in case it determines that your image is very outdated, however you can also force it with the same flag. -.. code-block:: +.. code-block:: bash ./breeze build-image --force-pull-images -Using the images -================ +Embedded image scripts +====================== + +Both images have a set of scripts that can be used in the image. Those are: + * /entrypoint - entrypoint script used when entering the image + * /clean-logs - script for periodic log cleaning + +Running the CI image +==================== + +The entrypoint in the CI image contains all the initialisation needed for tests to be immediately executed. +It is copied from ``scripts/in_container/entrypoint_ci.sh``. + +The default behaviour is that you are dropped into a bash shell. However if the RUN_TESTS variable is +set to "true", then the tests passed as arguments are executed. A short invocation sketch is shown at the end of this document. + +The entrypoint performs those operations: + +* checks if the environment is ready to test (including database and all integrations). It waits + until all the components are ready to work + +* installs an older version of Airflow (if an older version of Airflow is requested to be installed + via the ``INSTALL_AIRFLOW_VERSION`` variable).
+ +* Sets up Kerberos if Kerberos integration is enabled (generates and configures Kerberos token) + +* Sets up ssh keys for ssh tests and restarts the SSH server + +* Sets all variables and configurations needed for unit tests to run + +* Reads additional variables set in ``files/airflow-breeze-config/variables.env`` by sourcing that file + +* In case of a CI run, sets parallelism to 2 to avoid an excessive number of processes running + +* In case of a CI run, sets default parameters for pytest + +* In case of running integration/long_running/quarantined tests - it sets the right pytest flags + +* Sets the default "tests" target in case the target is not explicitly set as an additional argument + +* Runs system tests if the RUN_SYSTEM_TESTS flag is specified, otherwise runs regular unit and integration tests + + +Using, customising, and extending the production image +====================================================== + +You can read more about using, customising, and extending the production image in the +`documentation `_. + +Alpha versions of 1.10.10 production-ready images +================================================= + +The production images were released for the first time in the 1.10.10 release of Airflow as "Alpha" quality +ones. Since 1.10.10 the images are being improved and the 1.10.10 images should be patched and +published several times separately in order to test them with the upcoming Helm Chart. + +Those images are for development and testing only and should not be used outside of the +development community. + +The images were pushed with tags following the pattern: ``apache/airflow:1.10.10.1-alphaN-pythonX.Y``. +The patch level is an increasing number (starting from 1). + +Those are alpha-quality releases, however they contain the officially released Airflow ``1.10.10`` code. +The main changes in the images are the scripts embedded in the images. -Both images have entrypoint set as dumb-init with entrypoint.sh script executed (in order to forward -signals). This entrypoint works as follows: +The following versions were pushed: -* If ``AIRFLOW__CORE__SQL_ALCHEMY_CONN`` variable is passed to the container and it is either mysql or postgres - SQL alchemy connection, then the connection is checked and the script waits until the database is reachable. -If no ``AIRFLOW__CORE__SQL_ALCHEMY_CONN`` variable is set or if it is set to sqlite SQL alchemy connection - then db reset is executed. -If ``AIRFLOW__CELERY__BROKER_URL`` variable is passed and scheduler, worker of flower command is used then - the connection is checked and the script waits until the Celery broker database is reachable. ++-------+--------------------------------+----------------------------------------------------------+ | Patch | Tag pattern | Description | +=======+================================+==========================================================+ | 1 | ``1.10.10.1-alpha1-pythonX.Y`` | Support for parameters added to bash and python commands | +-------+--------------------------------+----------------------------------------------------------+ | 2 | ``1.10.10-1-alpha2-pythonX.Y`` | Added "/clean-logs" script | +-------+--------------------------------+----------------------------------------------------------+ -* If no argument is specified - you are dropped in bash shell. -If there are any arguments they are passed to "airflow" command +The commits used to generate those images are tagged with ``prod-image-1.10.10.1-alphaN`` tags.
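As a quick illustration of the entrypoint behaviour described in "Running the CI image" above, a hedged
sketch of invoking the image directly (the image tag, the test path and the docker flags are examples and
assumptions; in practice Breeze starts the required backing services such as the database for you):

.. code-block:: bash

    # Default behaviour - you are dropped into a bash shell inside the CI image
    docker run -it --rm apache/airflow:master-python3.6-ci

    # With RUN_TESTS=true the arguments are treated as tests to execute instead
    docker run -it --rm -e RUN_TESTS="true" apache/airflow:master-python3.6-ci \
        tests/core/test_core.py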
diff --git a/INSTALL b/INSTALL index 40fed43698834..6338fb774804c 100644 --- a/INSTALL +++ b/INSTALL @@ -31,26 +31,49 @@ source PATH_TO_YOUR_VENV/bin/activate # [required] building and installing by pip (preferred) pip install . -# or directly +NOTE! + +On 30th of November 2020, a new version of pip (20.3) was released with a new, 2020 resolver. +This resolver does not yet work with Apache Airflow and might lead to errors in installation, +depending on your choice of extras. In order to install Airflow you need to either downgrade +pip to version 20.2.4 ``pip install --upgrade pip==20.2.4`` or, in case you use Pip 20.3, you need to add the option +``--use-deprecated legacy-resolver`` to your pip install command. + + +# or you can install it directly via setup.py python setup.py install + # You can also install recommended version of the dependencies by using -# requirements-python.txt as constraint file. This is needed in case +# constraint-python.txt files as constraint files. This is needed in case # you have problems with installing the current requirements from PyPI. -# There are different requirements for different python versions. For example" +# There are different constraint files for different python versions and you should choose the +# version of constraints specific for your version. +# For example: + +pip install . \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.15/constraints-3.6.txt" + -pip install . --constraint requirements/requirements-python3.7.txt +.. note:: + On 30th of November 2020, a new version of pip (20.3) was released with a new, 2020 resolver. + This resolver does not yet work with Apache Airflow and might lead to errors in installation, + depending on your choice of extras. In order to install Airflow you need to either downgrade + pip to version 20.2.4 ``pip install --upgrade pip==20.2.4`` or, in case you use Pip 20.3, you need to add the option + ``--use-deprecated legacy-resolver`` to your pip install command. # You can also install Airflow with extras specified.
The list of available extras: # START EXTRAS HERE -all, all_dbs, async, atlas, aws, azure, azure_blob_storage, azure_container_instances, azure_cosmos, -azure_data_lake, cassandra, celery, cgroups, cloudant, crypto, dask, databricks, datadog, devel, -devel_azure, devel_ci, devel_hadoop, doc, docker, druid, elasticsearch, emr, gcp, gcp_api, -github_enterprise, google_auth, grpc, hashicorp, hdfs, hive, jdbc, jira, kerberos, kubernetes, ldap, -mongo, mssql, mysql, oracle, papermill, password, pinot, postgres, presto, qds, rabbitmq, redis, s3, -salesforce, samba, segment, sendgrid, sentry, slack, snowflake, ssh, statsd, vertica, virtualenv, -webhdfs, winrm +all, all_dbs, amazon, apache.atlas, apache.cassandra, apache.druid, apache.hdfs, apache.hive, +apache.pinot, apache.presto, apache.webhdfs, async, atlas, aws, azure, azure_blob_storage, +azure_container_instances, azure_cosmos, azure_data_lake, azure_secrets, cassandra, celery, cgroups, +cloudant, cncf.kubernetes, crypto, dask, databricks, datadog, devel, devel_all, devel_azure, +devel_ci, devel_hadoop, doc, docker, druid, elasticsearch, emr, gcp, gcp_api, github_enterprise, +google, google_auth, grpc, hashicorp, hdfs, hive, jdbc, jira, kerberos, kubernetes, ldap, +microsoft.azure, microsoft.mssql, microsoft.winrm, mongo, mssql, mysql, oracle, papermill, password, +pinot, postgres, presto, qds, rabbitmq, redis, s3, salesforce, samba, segment, sendgrid, sentry, +slack, snowflake, ssh, statsd, vertica, virtualenv, webhdfs, winrm # END EXTRAS HERE diff --git a/INTHEWILD.md b/INTHEWILD.md new file mode 100644 index 0000000000000..fe65c239411b2 --- /dev/null +++ b/INTHEWILD.md @@ -0,0 +1,407 @@ + + +## Who uses Apache Airflow? + +As the Apache Airflow community grows, we'd like to keep track of who is using +the platform. Please send a PR with your company name and @githubhandle. + +Currently, **officially** using Airflow: + +1. [4G Capital](http://www.4g-capital.com/) [[@posei](https://github.com/posei)] +1. [6play](https://www.6play.fr) [[@lemourA](https://github.com/lemoura), [@achaussende](https://github.com/achaussende), [@d-nguyen](https://github.com/d-nguyen), [@julien-gm](https://github.com/julien-gm)] +1. [8fit](https://8fit.com/) [[@nicor88](https://github.com/nicor88), [@frnzska](https://github.com/frnzska)] +1. [90 Seconds](https://90seconds.tv/) [[@aaronmak](https://github.com/aaronmak)] +1. [99](https://99taxis.com) [[@fbenevides](https://github.com/fbenevides), [@gustavoamigo](https://github.com/gustavoamigo) & [@mmmaia](https://github.com/mmmaia)] +1. [AMPATH](https://www.ampathkenya.org/) [[@AMPATH](https://github.com/AMPATH), [@fatmali](https://github.com/fatmali)] +1. [ARGO Labs](http://www.argolabs.org) [[@California Data Collaborative](https://github.com/California-Data-Collaborative)] +1. [ARMEDANGELS](https://www.armedangels.de) [[@swiffer](https://github.com/swiffer)] +1. [AdBOOST](https://www.adboost.sk) [[AdBOOST](https://github.com/AdBOOST)] +1. [Adobe](https://www.adobe.com/) [[@mishikaSingh](https://github.com/mishikaSingh), [@ramandumcs](https://github.com/ramandumcs), [@vardancse](https://github.com/vardancse)] +1. [Agari](https://github.com/agaridata) [[@r39132](https://github.com/r39132)] +1. [Agoda](https://agoda.com) [[@akki](https://github.com/akki)] +1. [AirDNA](https://www.airdna.co) +1. [Airbnb](http://airbnb.io/) [[@mistercrunch](https://github.com/mistercrunch), [@artwr](https://github.com/artwr)] +1. [Airfinity](https://www.airfinity.com) [[@sibowyer](https://github.com/sibowyer)] +1. 
[Airtel](https://www.airtel.in/) [[@harishbisht](https://github.com/harishbisht)] +1. [Akamai](https://www.akamai.com/) [[@anirudhbagri](https://github.com/anirudhbagri)] +1. [Akamas](https://akamas.io) [[@GiovanniPaoloGibilisco](https://github.com/GiovanniPaoloGibilisco), [@lucacavazzana](https://github.com/lucacavazzana)] +1. [Alan](https://alan.eu) [[@charles-go](https://github.com/charles-go)] +1. [AloPeyk](https://alopeyk.com) [[@blcksrx](https://github.com/blcksrx), [@AloPeyk](https://github.com/AloPeyk)] +1. [AltX](https://www.getaltx.com/about) [[@pedromduarte](https://github.com/pedromduarte)] +1. [American Family Insurance](https://www.amfam.com/about) [[@di1eep](https://github.com/di1eep)] +1. [Apigee](https://apigee.com) [[@btallman](https://github.com/btallman)] +1. [Arquivei](https://www.arquivei.com.br/) [[@arquivei](https://github.com/arquivei)] +1. [Arrive](https://www.arrive.com/) +1. [Asana](https://asana.com/) [[@chang](https://github.com/chang), [@dima-asana](https://github.com/dima-asana), [@jdavidheiser](https://github.com/jdavidheiser), [@ricardoandresrojas](https://github.com/ricardoandresrojas)] +1. [Astronomer](http://www.astronomer.io) [[@schnie](https://github.com/schnie), [@ashb](https://github.com/ashb), [@kaxil](https://github.com/kaxil), [@dimberman](https://github.com/dimberman), [@andriisoldatenko](https://github.com/andriisoldatenko), [@ryw](https://github.com/ryw), [@ryanahamilton](https://github.com/ryanahamilton), [@jhtimmins](https://github.com/jhtimmins), [@vikramkoka](https://github.com/vikramkoka)] +1. [Auth0](https://auth0.com) [[@scottypate](https://github.com/scottypate)], [[@dm03514](https://github.com/dm03514)], [[@karangale](https://github.com/karangale)] +1. [Automattic](https://automattic.com/) [[@anandnalya](https://github.com/anandnalya), [@bperson](https://github.com/bperson), [@khrol](https://github.com/Khrol), [@xyu](https://github.com/xyu)] +1. [Avesta Technologies](https://avestatechnologies.com) [[@TheRum](https://github.com/TheRum)] +1. [Away](https://awaytravel.com) [[@trunsky](https://github.com/trunsky)] +1. [Axesor designeted activity company](https://www.axesor.es/) +1. [Azri Solutions](http://www.azrisolutions.com/) [[@userimack](https://github.com/userimack)] +1. [BBM](https://www.bbm.com/) +1. [Bagelcode](https://site.bagelcode.com/) +1. [BalanceHero](http://truebalance.io/) [[@swalloow](https://github.com/swalloow)] +1. [Banco de Formaturas](https://www.bancodeformaturas.com.br) [[@guiligan](https://github.com/guiligan)] +1. [BandwidthX](http://www.bandwidthx.com) [[@dineshdsharma](https://github.com/dineshdsharma)] +1. [Basetis](http://www.basetis.com) +1. [Beamly](https://www.beamly.com/) [[@christopheralcock](https://github.com/christopheralcock)] +1. [Beeswax](https://beeswax.com/) +1. [Bellhops](https://github.com/bellhops) +1. [BelugaDB](https://belugadb.com) [[@fabio-nukui](https://github.com/fabio-nukui) & [@joao-sallaberry](http://github.com/joao-sallaberry) & [@lucianoviola](https://github.com/lucianoviola) & [@tmatuki](https://github.com/tmatuki)] +1. [Betterment](https://www.betterment.com/) [[@betterment](https://github.com/Betterment)] +1. [Bexs Bank](https://www.bexs.com.br/en) [[@felipefb](https://github.com/felipefb) & [@ilarsen](https://github.com/ishvann)] +1. [BigQuant](https://bigquant.com/) [[@bigquant](https://github.com/bigquant)] +1. [Birdz by Veolia](https://www.birdz.com/en/) [[@benjamingrenier](https://github.com/benjamingrenier)] +1. 
[BlaBlaCar](https://www.blablacar.com) [[@puckel](https://github.com/puckel) & [@wmorin](https://github.com/wmorin)] +1. [Blacklane](https://www.blacklane.com) [[@serkef](https://github.com/serkef)] +1. [Bloc](https://www.bloc.io) [[@dpaola2](https://github.com/dpaola2)] +1. [Bloomberg](https://www.techatbloomberg.com) [[@dimberman](https://github.com/dimberman)] +1. [Bloomreach](https://www.bloomreach.com/) [[@neelborooah](https://github.com/neelborooah) & [@debodirno](https://github.com/debodirno) & [@ayushmnnit](https://github.com/ayushmnnit)] +1. [Blue Yonder](http://www.blue-yonder.com) [[@blue-yonder](https://github.com/blue-yonder)] +1. [BlueApron](https://www.blueapron.com) [[@jasonjho](https://github.com/jasonjho) & [@matthewdavidhauser](https://github.com/matthewdavidhauser)] +1. [Bluecore](https://www.bluecore.com) [[@JLDLaughlin](https://github.com/JLDLaughlin)] +1. [Bluekiri](https://bluekiri.com) [[@Bluekiri](https://github.com/bluekiri)] +1. [Boda Telecom Suite - CE](https://github.com/bodastage/bts-ce) [[@erssebaggala](https://github.com/erssebaggala), [@bodastage](https://github.com/bodastage)] +1. [Bodastage Solutions](http://bodastage.com) [[@erssebaggala](https://github.com/erssebaggala), [@bodastage](https://github.com/bodastage)] +1. [Bombora Inc](https://bombora.com/) [[@jeffkpayne](https://github.com/jeffkpayne), [@pakelley](https://github.com/pakelley), [@dNavalta](https://github.com/dNavalta), [@austynh](https://github.com/austynh), [@TheOriginalAlex](https://github.com/TheOriginalAlex)] +1. [Bonial International GmbH](https://www.bonial.com/) +1. [Bonnier Broadcasting](http://www.bonnierbroadcasting.com) [[@wileeam](https://github.com/wileeam)] +1. [BounceX](http://www.bouncex.com) [[@JoshFerge](https://github.com/JoshFerge), [@hudsonrio](https://github.com/hudsonrio), [@ronniekritou](https://github.com/ronniekritou)] +1. [Braintree](https://www.braintreepayments.com) [[@coopergillan](https://github.com/coopergillan), [@curiousjazz77](https://github.com/curiousjazz77), [@raymondberg](https://github.com/raymondberg)] +1. [Branch](https://branch.io) [[@sdebarshi](https://github.com/sdebarshi), [@dmitrig01](https://github.com/dmitrig01)] +1. [CAVA](https://www.cava.com) [[@minh5](http://github.com/minh5) & [@patchus](http://github.com/patchus)] +1. [Caesars Entertainment](https://www.caesars.com) +1. [California Data Collaborative](https://github.com/California-Data-Collaborative) powered by [ARGO Labs](http://www.argolabs.org) +1. [Capital One](https://www.capitalone.com) [[@anoopengineer](https://github.com/anoopengineer)] +1. [CarLabs](https://www.carlabs.ai/) [[@sganz](https://github.com/sganz) & [@odannyc](https://github.com/odannyc)] +1. [Carbonite](https://www.carbonite.com) [[@ajbosco](https://github.com/ajbosco)] +1. [Celect](http://www.celect.com) [[@superdosh](https://github.com/superdosh) & [@chadcelect](https://github.com/chadcelect)] +1. [Censys](https://censys.io) [[@zakird](https://github.com/zakird), [@dadrian](https://github.com/dadrian), & [@andrewsardone](https://github.com/andrewsardone)] +1. [Change.org](https://www.change.org) [[@change](https://github.com/change), [@vijaykramesh](https://github.com/vijaykramesh)] +1. [Chartboost](https://www.chartboost.com) [[@cgelman](https://github.com/cgelman) & [@dclubb](https://github.com/dclubb)] +1. [Checkr](https://checkr.com) [[@tongboh](https://github.com/tongboh)] +1. 
[Children's Hospital of Philadelphia Division of Genomic Diagnostics](http://www.chop.edu/centers-programs/division-genomic-diagnostics) [[@genomics-geek](https://github.com/genomics-geek/)] +1. [Cinimex DataLab](http://cinimex.ru) [[@kdubovikov](https://github.com/kdubovikov)] +1. [City of San Diego](http://sandiego.gov) [[@MrMaksimize](https://github.com/mrmaksimize), [@andrell81](https://github.com/andrell81) & [@arnaudvedy](https://github.com/arnaudvedy)] +1. [City of Toronto](https://www.toronto.ca/) [[@CityofToronto](https://github.com/CityofToronto), [@radumas](https://github.com/radumas)] +1. [Civey](https://civey.com/) [[@WesleyBatista](https://github.com/WesleyBatista)] +1. [Clairvoyant](https://clairvoyantsoft.com) [[@shekharv](https://github.com/shekharv)] +1. [Classmethod, Inc.](https://classmethod.jp/) [[@shoito](https://github.com/shoito)] +1. [Cleartax](https://cleartax.in/) [[@anks](https://github.com/anks) & [@codebuff](https://github.com/codebuff)] +1. [Cloudera](https://www.cloudera.com/) [[@phraniiac](https://github.com/phraniiac) & [@VivekPemawat](https://github.com/VivekPemawat)] +1. [Clover Health](https://www.cloverhealth.com) [[@gwax](https://github.com/gwax) & [@vansivallab](https://github.com/vansivallab)] +1. [Colgate-Palmolive](https://www.colgatepalmolive.com/) [[@fhoda](https://github.com/fhoda)] +1. [Collectivehealth Inc.](https://www.collectivehealth.com) [[@retornam](https://github.com/retornam)] +1. [Compass](https://www.compass.com) [[@wdhorton](https://github.com/wdhorton)] +1. [ConnectWise](https://www.connectwise.com/) [[@jacobeturpin](https://github.com/jacobeturpin)] +1. [ContaAzul](https://www.contaazul.com) [[@bern4rdelli](https://github.com/bern4rdelli), [@renanleme](https://github.com/renanleme) & [@sabino](https://github.com/sabino)] +1. [Cotap](https://github.com/cotap/) [[@maraca](https://github.com/maraca) & [@richardchew](https://github.com/richardchew)] +1. [Craig@Work](https://www.craigatwork.com) +1. [Crealytics](https://crealytics.com) +1. [Credit Karma](https://www.creditkarma.com/) [[@preete-dixit-ck](https://github.com/preete-dixit-ck) & [@harish-gaggar-ck](https://github.com/harish-gaggar-ck) & [@greg-finley-ck](https://github.com/greg-finley-ck)] +1. [CreditCards.com](https://www.creditcards.com/) [[@vmAggies](https://github.com/vmAggies) & [@jay-wallaby](https://github.com/jay-wallaby)] +1. [Creditas](https://www.creditas.com.br) [[@dcassiano](https://github.com/dcassiano)] +1. [Cryptalizer.com](https://www.cryptalizer.com/) +1. [Currency](https://www.gocurrency.com/) [[@FCLI](https://github.com/FCLI) & [@alexbegg](https://github.com/alexbegg)] +1. [Custom Ink](https://www.customink.com/) [[@david-dalisay](https://github.com/david-dalisay), [@dmartin11](https://github.com/dmartin11) & [@mpeteuil](https://github.com/mpeteuil)] +1. [Cyscale](https://cyscale.com) [[@ocical](https://github.com/ocical)] +1. [Dailymotion](http://www.dailymotion.com/fr) [[@germaintanguy](https://github.com/germaintanguy) & [@hc](https://github.com/hc)] +1. [Danamica](https://www.danamica.dk) [[@testvinder](https://github.com/testvinder)] +1. [Data Reply](https://www.datareply.co.uk/) [[@kaxil](https://github.com/kaxil)] +1. [DataCamp](https://datacamp.com/) [[@dgrtwo](https://github.com/dgrtwo)] +1. [DataFox](https://www.datafox.com/) [[@sudowork](https://github.com/sudowork)] +1. [DataSprints](https://datasprints.com/) [[@lopesdiego12](https://github.com/lopesdiego12) & [@rafaelsantanaep](https://github.com/rafaelsantanaep)] +1. 
[Datamaran](https://www.datamaran.com) [[@valexharo](https://github.com/valexharo)] +1. [Datumo](https://datumo.io) [[@michalmisiewicz](https://github.com/michalmisiewicz)] +1. [Dentsu Inc.](http://www.dentsu.com/) [[@bryan831](https://github.com/bryan831) & [@loozhengyuan](https://github.com/loozhengyuan)] +1. [Deseret Digital Media](http://deseretdigital.com/) [[@formigone](https://github.com/formigone) +1. [Digital First Media](http://www.digitalfirstmedia.com/) [[@duffn](https://github.com/duffn) & [@mschmo](https://github.com/mschmo) & [@seanmuth](https://github.com/seanmuth)] +1. [DigitalOcean](https://digitalocean.com/) [[@ajbosco](https://github.com/ajbosco)] +1. [Digitas Pixelpark](https://www.digitaspixelpark.com/) [[@feluelle](https://github.com/feluelle)] +1. [DoorDash](https://www.doordash.com/) +1. [Dotmodus](http://dotmodus.com) [[@dannylee12](https://github.com/dannylee12)] +1. [Drivy](https://www.drivy.com) [[@AntoineAugusti](https://github.com/AntoineAugusti)] +1. [Dynata](https://www.dynata.com) [[@neil3handari](https://github.com/neil3handari)] +1. [EBANX](https://www.ebanx.com/) [[@estevammr](https://github.com/estevammr) & [@nathangngencissk](https://github.com/nathangngencissk) & [@raafaadg](https://github.com/raafaadg) & [@whrocha](https://github.com/whrocha)] +1. [Easy Taxi](http://www.easytaxi.com/) [[@caique-lima](https://github.com/caique-lima) & [@diraol](https://github.com/diraol)] +1. [Elai Data](https://www.elaidata.com/) [[@lgov](https://github.com/lgov)] +1. [EllisDon](http://www.ellisdon.com/) [[@d2kalra](https://github.com/d2kalra) & [@zbasama](https://github.com/zbasama)] +1. [Endesa](https://www.endesa.com) [[@drexpp](https://github.com/drexpp)] +1. [Enigma](https://www.enigma.com) [[@hydrosquall](https://github.com/hydrosquall)] +1. [Etsy](https://www.etsy.com) [[@mchalek](https://github.com/mchalek)] +1. [Everis](https://www.everis.com) [[@diegobenedicto](https://github.com/diegobenedicto)] +1. [Everlane](https://everlane.com) [[@NickBenthem](https://github.com/NickBenthem)] +1. [Experity (formerly DocuTAP)](https://www.experityhealth.com/) [[@cloneluke](https://github.com/cloneluke) & [@tobyjoliver](https://github.com/tobyjoliver)] +1. [FanDuel](https://www.fanduel.com/) +1. [Farfetch](https://github.com/farfetch) [[@davidmarques78](https://github.com/davidmarques78)] +1. [Fathom Health](https://www.fathomhealth.co/) +1. [Firestone Inventing](https://www.hsmap.com/) [[@zihengCat](https://github.com/zihengCat)] +1. [Flipp](https://www.flipp.com) [[@sethwilsonwishabi](https://github.com/sethwilsonwishabi)] +1. [Format](https://www.format.com) [[@format](https://github.com/4ormat) & [@jasonicarter](https://github.com/jasonicarter)] +1. [FreeNow](https://free-now.com) [[@freenowtech](https://github.com/freenowtech)] +1. [FreshBooks](https://github.com/freshbooks) [[@DinoCow](https://github.com/DinoCow)] +1. [Freshworks](https://www.freshworks.com/) [[@shaikshakeel](https://github.com/shaikshakeel)] +1. [FullContact](https://github.com/fullcontact) +1. [Fuller, Inc.](https://en.fuller-inc.com/) [[@wutali](https://github.com/wutali) & [@sh-tech](https://github.com/sh-tech)] +1. [Fundera](https://fundera.com) [[@andyxhadji](https://github.com/andyxhadji)] +1. [G Adventures](https://gadventures.com) [[@chchtv11](https://github.com/chchtv11), [@tgumbley](https://github.com/tgumbley), [@tomwross](https://github.com/tomwross)] +1. [GSN Games](https://www.gsngames.com) +1. 
[GameWisp](https://gamewisp.com) [[@tjbiii](https://github.com/TJBIII) & [@theryanwalls](https://github.com/theryanwalls)] +1. [Geekie](https://www.geekie.com.br) [[@wolney](https://github.com/wolney)] +1. [GeneCards](https://www.genecards.org) [[@oferze](https://github.com/oferze)] +1. [Gentner Lab](http://github.com/gentnerlab) [[@neuromusic](https://github.com/neuromusic)] +1. [Get Simpl](https://getsimpl.com/) [[@rootcss](https://github.com/rootcss)] +1. [GitLab](https://about.gitlab.com/) [[@tayloramurphy](https://gitlab.com/tayloramurphy) & [@m_walker](https://gitlab.com/m_walker)] +1. [Glassdoor](https://github.com/Glassdoor) [[@syvineckruyk](https://github.com/syvineckruyk) & [@sid88in](https://github.com/sid88in)] +1. [Global Fashion Group](http://global-fashion-group.com) [[@GFG](https://github.com/GFG)] +1. [GoDataDriven](https://godatadriven.com/) [[@BasPH](https://github.com/basph), [@danielvdende](https://github.com/danielvdende), [@ffinfo](https://github.com/ffinfo), [@Fokko](https://github.com/Fokko), [@gglanzani](https://github.com/gglanzani), [@hgrif](https://github.com/hgrif), [@jrderuiter](https://github.com/jrderuiter), [@NielsZeilemaker](https://github.com/NielsZeilemaker)] +1. [Gojek](https://gojek.com/) [[@gojek](https://github.com/gojek), [@rootcss](https://github.com/rootcss)] +1. [GovTech GDS](https://gds-gov.tech) [[@chrissng](https://github.com/chrissng) & [@datagovsg](https://github.com/datagovsg)] +1. [Grab](https://www.grab.com/sg/) [[@calvintran](https://github.com/canhtran)] +1. [Gradeup](https://gradeup.co) [[@gradeup](https://github.com/gradeup)] +1. [Grand Rounds](https://www.grandrounds.com/) [[@richddr](https://github.com/richddr), [@timz1290](https://github.com/timz1290), [@wenever](https://github.com/@wenever), & [@runongirlrunon](https://github.com/runongirlrunon)] +1. [Greytip](https://www.greytip.com) [[@greytip](https://github.com/greytip)] +1. [Groupalia](http://es.groupalia.com) [[@jesusfcr](https://github.com/jesusfcr)] +1. [Groupon](https://groupon.com) [[@stevencasey](https://github.com/stevencasey)] +1. [Growbots](https://www.growbots.com/) [[@exploy](https://github.com/exploy)] +1. [GrowthSimple](https://growthsimple.ai/) +1. [Gusto](https://gusto.com) [[@frankhsu](https://github.com/frankhsu)] +1. [HAVAN](https://www.havan.com.br) [[@botbiz](https://github.com/botbiz)] +1. [HBC Digital](http://tech.hbc.com) [[@tmccartan](https://github.com/tmccartan) & [@dmateusp](https://github.com/dmateusp)] +1. [HBO](http://www.hbo.com/) [[@yiwang](https://github.com/yiwang)] +1. [Handshake](https://joinhandshake.com/) [[@mhickman](https://github.com/mhickman)] +1. [Handy](http://www.handy.com/careers/73115?gh_jid=73115&gh_src=o5qcxn) [[@marcintustin](https://github.com/marcintustin) / [@mtustin-handy](https://github.com/mtustin-handy)] +1. [Healthjump](http://www.healthjump.com/) [[@miscbits](https://github.com/miscbits)] +1. [HelloFresh](https://www.hellofresh.com) [[@tammymendt](https://github.com/tammymendt) & [@davidsbatista](https://github.com/davidsbatista) & [@iuriinedostup](https://github.com/iuriinedostup)] +1. [Hipages](https://www.hipages.com.au/) [[@arihantsurana](https://github.com/arihantsurana)] +1. [Holimetrix](http://holimetrix.com/) [[@thibault-ketterer](https://github.com/thibault-ketterer)] +1. [HomeToGo](https://www.hometogo.com/) [[@HomeToGo](https://github.com/hometogo), [@AurimasGr](https://github.com/AurimasGr)] +1. [Hootsuite](https://github.com/hootsuite) +1. 
[Hostnfly](https://www.hostnfly.com/) [[@CyrilLeMat](https://github.com/CyrilLeMat) & [@pierrechopin](https://github.com/pierrechopin) & [@alexisrosuel](https://github.com/alexisrosuel)] +1. [HotelQuickly](https://github.com/HotelQuickly) [[@zinuzoid](https://github.com/zinuzoid)] +1. [Huq Industries](https://huq.io) [[@huqindustries](https://github.com/huq-industries), [@alepuccetti](https://github.com/alepuccetti), [@turbomerl](https://github.com/turbomerl)] +1. [Hurb](https://hurb.com/) [[@hurbcom](https://github.com/hurbcom)] +1. [IFTTT](https://www.ifttt.com/) [[@apurvajoshi](https://github.com/apurvajoshi)] +1. [ING](http://www.ing.com/) +1. [Iflix](https://piay.iflix.com) [[@ChaturvediSulabh](https://github.com/ChaturvediSulabh)] +1. [Indeed](https://www.indeed.com/) [[@chrismclennon](https://github.com/chrismclennon), [@raj-manvar](https://github.com/raj-manvar), [@Adil-Ibragimov](https://github.com/Adil-Ibragimov)] +1. [Instacart 🥕](http://www.instacart.com/) [[@arp1t](https://github.com/arp1t) & [@code-sauce](https://github.com/code-sauce) & [@jasonlew](https://github.com/jasonlew) & [@j4p3](https://github.com/j4p3) & [@lubert](https://github.com/lubert) & [@mmontagna](https://github.com/mmontagna) & [@RyanAD](https://github.com/RyanAD) &[@zzadeh](https://github.com/zzadeh)] +1. [Intercom](http://www.intercom.com/) [[@fox](https://github.com/fox) & [@paulvic](https://github.com/paulvic)] +1. [Interia](http://www.interia.pl) +1. [Investorise](https://investorise.com/) [[@svenvarkel](https://github.com/svenvarkel)] +1. [JULO](https://www.julo.co.id/) [[@sepam](https://github.com/sepam) & [@tenapril](https://github.com/tenapril) & [@verzqy](https://github.com/verzqy)] +1. [Jampp](https://github.com/jampp) +1. [Jeitto](https://www.jeitto.com.br) [[@BrennerPablo](https://github.com/BrennerPablo) & [@ds-mauri](https://github.com/ds-mauri)] +1. [Jetlore](http://www.jetlore.com/) [[@bderose](https://github.com/bderose)] +1. [JobTeaser](https://www.jobteaser.com) [[@stefani75](https://github.com/stefani75) & [@knil-sama](https://github.com/knil-sama)] +1. [Jobrapido](https://www.jobrapido.com/) [[@mattiagiupponi](https://github.com/mattiagiupponi)] +1. [KPN B.V.](https://www.kpn.com/) [[@biyanisuraj](https://github.com/biyanisuraj) & [@gmic](https://github.com/gmic)] +1. [Kalibrr](https://www.kalibrr.com/) [[@charlesverdad](https://github.com/charlesverdad)] +1. [Kargo](https://kargo.com) [[@chaithra-yenikapati](https://github.com/chaithra-yenikapati), [@akarsh3007](https://github.com/akarsh3007) & [@dineshanchan](https://github.com/dineshanchan)] +1. [Karmic](https://karmiclabs.com) [[@hyw](https://github.com/hyw)] +1. [King Abdullah Petroleum Studies and Research Center(KAPSARC)](https://github.com/kapsarc) [[@saianupkumarp](https://github.com/saianupkumarp)] +1. [King](https://king.com) [[@nathadfield](https://github.com/nathadfield)] +1. [Kiwi.com](https://kiwi.com/) [[@underyx](https://github.com/underyx)] +1. [Kogan.com](https://github.com/kogan) [[@geeknam](https://github.com/geeknam)] +1. [Korbit](https://www.korbit.co.kr/) [[@jensenity](https://github.com/jensenity)] +1. [Kroton Educacional](http://www.kroton.com.br/) +1. [LeMans Corporation](https://www.parts-unlimited.com/) [[@alloydwhitlock](https://github.com/alloydwhitlock)] & [[@tinyrye](https://github.com/tinyrye)] +1. [Lemann Foundation](http://fundacaolemann.org.br) [[@fernandosjp](https://github.com/fernandosjp)] +1. [LendUp](https://www.lendup.com/) [[@lendup](https://github.com/lendup)] +1. 
[LetsBonus](http://www.letsbonus.com) [[@jesusfcr](https://github.com/jesusfcr) & [@OpringaoDoTurno](https://github.com/OpringaoDoTurno)] +1. [Liberty Global](https://www.libertyglobal.com/) [[@LibertyGlobal](https://github.com/LibertyGlobal/)] +1. [LingoChamp](http://www.liulishuo.com/) [[@haitaoyao](https://github.com/haitaoyao)] +1. [Logitravel Group](https://www.logitravel.com/) +1. [LokSuvidha](http://loksuvidha.com/) [[@saurabhwahile](https://github.com/saurabhwahile)] +1. [Los Angeles Times](http://www.latimes.com/) [[@standyro](https://github.com/standyro)] +1. [Lucid](http://luc.id) [[@jbrownlucid](https://github.com/jbrownlucid) & [@kkourtchikov](https://github.com/kkourtchikov)] +1. [Lumos Labs](https://www.lumosity.com/) [[@rfroetscher](https://github.com/rfroetscher/) & [@zzztimbo](https://github.com/zzztimbo/)] +1. [Lyft](https://www.lyft.com/) [[@feng-tao](https://github.com/feng-tao), [@milton0825](https://github.com/milton0825), [@astahlman](https://github.com/astahlman), [@youngyjd](https://github.com/youngyjd), [@ArgentFalcon](https://github.com/ArgentFalcon)] +1. [M4U](https://www.m4u.com.br/) [[@msantino](https://github.com/msantino)] +1. [MFG Labs](https://github.com/MfgLabs) +1. [Madrone](http://madroneco.com/) [[@mbreining](https://github.com/mbreining) & [@scotthb](https://github.com/scotthb)] +1. [Markovian](https://markovian.com/) [[@al-xv](https://github.com/al-xv), [@skogsbaeck](https://github.com/skogsbaeck), [@waltherg](https://github.com/waltherg)] +1. [Mercadoni](https://www.mercadoni.com.co) [[@demorenoc](https://github.com/demorenoc)] +1. [Mercari](http://www.mercari.com/) [[@yu-iskw](https://github.com/yu-iskw)] +1. [MeuVendoo](https://www.meuvendoo.com.br) [[@CarlosDutra](https://github.com/CarlosDutra)] +1. [MiNODES](https://www.minodes.com) [[@dice89](https://github.com/dice89), [@diazcelsa](https://github.com/diazcelsa)] +1. [Ministry of Economy of Brazil](https://www.gov.br/economia/) [[@nitaibezerra](https://github.com/nitaibezerra), [@vitorbellini](https://github.com/vitorbellini)] +1. [Modernizing Medicine](https://www.modmed.com/) [[@kehv1n](https://github.com/kehv1n), [@dalupus](https://github.com/dalupus)] +1. [Movember](https://movember.com) +1. [Multiply](https://www.multiply.com) [[@nrhvyc](https://github.com/nrhvyc)] +1. [NEXT Trucking](https://www.nexttrucking.com/) [[@earthmancash2](https://github.com/earthmancash2), [@kppullin](https://github.com/kppullin)] +1. [National Bank of Canada](https://nbc.ca) [[@brilhana](https://github.com/brilhana)] +1. [Nav, Inc.](https://nav.com/) [[@tigerjz32](https://github.com/tigerjz32)] +1. [Neoway](https://www.neoway.com.br/) [[@neowaylabs](https://github.com/orgs/NeowayLabs/people)] +1. [Nerdwallet](https://www.nerdwallet.com) +1. [New Relic](https://www.newrelic.com) [[@marcweil](https://github.com/marcweil)] +1. [Newzoo](https://www.newzoo.com) [[@newzoo-nexus](https://github.com/newzoo-nexus)] +1. [Nextdoor](https://nextdoor.com) [[@SivaPandeti](https://github.com/SivaPandeti), [@zshapiro](https://github.com/zshapiro) & [@jthomas123](https://github.com/jthomas123)] +1. [Nielsen](https://www.nielsen.com) [[@roitvt](https://github.com/roitvt) & [@itaiy](https://github.com/itaiy)] +1. [Nine](https://nine.com.au) [[@TheZepto](https://github.com/TheZepto)] +1. [OVH](https://www.ovh.com) [[@ncrocfer](https://github.com/ncrocfer) & [@anthonyolea](https://github.com/anthonyolea)] +1. [OdysseyPrime](https://www.goprime.io/) [[@davideberdin](https://github.com/davideberdin)] +1. 
[OfferUp](https://offerupnow.com) +1. [OneFineStay](https://www.onefinestay.com) [[@slangwald](https://github.com/slangwald)] +1. [Open Knowledge International](https://okfn.org) [@vitorbaptista](https://github.com/vitorbaptista) +1. [OpenSlate](https://openslate.com) [@marcusianlevine](https://github.com/marcusianlevine) +1. [Opensignal](https://www.opensignal.com) [@harrisjoseph](https://github.com/harrisjoseph) +1. [Optum](https://www.optum.com/) - [UnitedHealthGroup](https://www.unitedhealthgroup.com/) [[@fhoda](https://github.com/fhoda), [@ianstanton](https://github.com/ianstanton), [@nilaybhatt](https://github.com/NilayBhatt),[@hiteshrd](https://github.com/hiteshrd)] +1. [OrangeBank](https://www.orangebank.fr/) [[@HamzaBoukraa](https://github.com/HamzaBoukraa)] +1. [Outcome Health](https://www.outcomehealth.com/) [[@mikethoun](https://github.com/mikethoun), [@rolandotribo](https://github.com/rolandotribo)] +1. [Overstock](https://www.github.com/overstock) [[@mhousley](https://github.com/mhousley) & [@mct0006](https://github.com/mct0006)] +1. [PAYMILL](https://www.paymill.com/) [[@paymill](https://github.com/paymill) & [@matthiashuschle](https://github.com/matthiashuschle)] +1. [PMC](https://pmc.com/) [[@andrewm4894](https://github.com/andrewm4894)] +1. [PXYData](https://www.pxydata.com) [[@patchus](http://github.com/patchus)] +1. [Pagar.me](https://pagar.me/) [[@pagarme](https://github.com/pagarme)] +1. [Palo Alto Networks](https://www.paloaltonetworks.com/) [[@PaloAltoNetworks](https://github.com/PaloAltoNetworks)] +1. [Pandora Media](https://www.pandora.com/) [[@Acehaidrey](https://github.com/Acehaidrey) & [@wolfier](https://github.com/wolfier)] +1. [Paraná Banco](https://paranabanco.com.br/) [[@lopesdiego12](https://github.com/lopesdiego12/)] +1. [PayFit](https://payfit.com) [[@pcorbel](https://github.com/pcorbel)] +1. [PayPal](https://www.paypal.com/) [[@r39132](https://github.com/r39132) & [@jhsenjaliya](https://github.com/jhsenjaliya)] +1. [Pecan](https://www.pecan.ai) [[@ohadmata](https://github.com/ohadmata)] +1. [Pernod-Ricard](https://www.pernod-ricard.com/) [[@romain-nio](https://github.com/romain-nio)] +1. [Plaid](https://www.plaid.com/) [[@plaid](https://github.com/plaid), [@AustinBGibbons](https://github.com/AustinBGibbons) & [@jeeyoungk](https://github.com/jeeyoungk)] +1. [Playbuzz](https://www.playbuzz.com/) [[@clintonboys](https://github.com/clintonboys) & [@dbn](https://github.com/dbn)] +1. [Playsimple Games](https://playsimple.in/) [[@joshi95](https://github.com/joshi95)] +1. [Polidea](https://www.polidea.com/) [[@potiuk](https://github.com/potiuk), [@mschickensoup](https://github.com/mschickensoup), [@mik-laj](https://github.com/mik-laj), [@turbaszek](https://github.com/turbaszek), [@michalslowikowski00](https://github.com/michalslowikowski00), [@olchas](https://github.com/olchas), [@debek](https://github.com/debek), [@FHoffmannCode](https://github.com/FHoffmannCode), [@TobKed](https://github.com/TobKed)] +1. [Poshmark](https://www.poshmark.com) +1. [Postmates](http://www.postmates.com) [[@syeoryn](https://github.com/syeoryn)] +1. [Premise](http://www.premise.com) [[@jmccallum-premise](https://github.com/jmccallum-premise)] +1. [Promofarma](https://www.promofarma.com/) [[@JavierLopezT](https://github.com/JavierLopezT)] +1. [Pronto Tools](http://www.prontotools.io/) [[@zkan](https://github.com/zkan) & [@mesodiar](https://github.com/mesodiar)] +1. [PubNub](https://pubnub.com) [[@jzucker2](https://github.com/jzucker2)] +1. 
[Qoala](https://www.qoala.id) [[@gnomeria](https://github.com/gnomeria), [@qoala-engineering](https://github.com/qoala-engineering)] +1. [Qplum](https://qplum.co) [[@manti](https://github.com/manti)] +1. [Quantopian](https://www.quantopian.com/) [[@eronarn](http://github.com/eronarn)] +1. [Qubole](https://qubole.com) [[@msumit](https://github.com/msumit)] +1. [QuintoAndar](https://quintoandar.com.br) [[@quintoandar](https://github.com/quintoandar)] +1. [Quizlet](https://quizlet.com) [[@quizlet](https://github.com/quizlet)] +1. [Quora](https://www.quora.com/) +1. [REA Group](https://www.rea-group.com/) +1. [Raisin](https://www.raisin.com/) [[@davido912](https://github.com/davido912)] +1. [Rakuten](https://www.rakuten.com) +1. [Rapido](https://rapido.bike/) [[@ChethanUK](https://github.com/ChethanUK)] +1. [Raízen](https://www.raizen.com.br/) [[@rudlac](https://github.com/rudlac) & [@guifneves](https://github.com/guifneves)] +1. [Reddit](https://www.reddit.com/) [[@reddit](https://github.com/reddit/)] +1. [Reverb](https://reverb.com)[[@reverbdotcom](https://github.com/reverbdotcom)] +1. [Revolut](https://www.revolut.com/) [[@sztanko](https://github.com/sztanko) & [@nautilus28](https://github.com/nautilus28)] +1. [Robinhood](https://robinhood.com) [[@vineet-rh](https://github.com/vineet-rh)] +1. [RushOwl](https://www.rushowl.sg) [[@songyanho](https://github.com/songyanho)] +1. [Scaleway](https://scaleway.com) [[@kdeldycke](https://github.com/kdeldycke)] +1. [Seasoned](https://www.seasoned.co/) [[@joshuacano](https://github.com/joshuacano)] & [[@mmyers](https://github.com/mmyers5)] & [[@tjward](https://github.com/tjward)] +1. [Secret Escapes](https://www.secretescapes.com) [[@secretescapes](https://github.com/secretescapes)] +1. [Semantics3](https://www.semantics3.com) [[@abishekk92](https://github.com/abishekk92)] +1. [Sense360](https://github.com/Sense360) [[@kamilmroczek](https://github.com/KamilMroczek)] +1. [Sentry.io](https://www.sentry.io) [[@tiopi](https://github.com/tiopi)] +1. [ShopBack](https://www.shopback.sg/) [[@shopback](https://github.com/shopback)] +1. [Shopkick](https://shopkick.com/) [[@shopkick](https://github.com/shopkick)] +1. [Sidecar](https://hello.getsidecar.com/) [[@getsidecar](https://github.com/getsidecar)] +1. [SimilarWeb](https://www.similarweb.com/) [[@similarweb](https://github.com/similarweb)] +1. [Simply Business](https://www.simplybusiness.com/) [[@simplybusiness](https://github.com/simplybusiness)] +1. [Skyscanner](https://www.skyscanner.net/) [[@skyscanner](https://github.com/Skyscanner)] +1. [SmartNews](https://www.smartnews.com/) [[@takus](https://github.com/takus)] +1. [SnapTravel](https://www.snaptravel.com/) +1. [SocialCops](https://www.socialcops.com/) [[@vinayak-mehta](https://github.com/vinayak-mehta) & [@sharky93](https://github.com/sharky93)] +1. [Société générale](https://www.societegenerale.fr/) [[@medmrgh](https://github.com/medmrgh) & [@s83](https://github.com/s83)] +1. [SpotHero](https://github.com/spothero) [[@benjigoldberg](https://github.com/benjigoldberg)] +1. [Spotahome](https://www.spotahome.com/) [[@spotahome](https://github.com/spotahome)] +1. [Spotify](https://github.com/spotify) [[@znichols](https://github.com/znichols)] +1. [Square](https://squareup.com/) +1. [Stackspace](https://beta.stackspace.io/) +1. [StoneCo](https://www.stone.co) [[@lgwacker](https://github.com/lgwacker)] +1. 
[Strava](https://strava.com) [[@strava](https://github.com/strava), [@dhuang](https://github.com/dhuang) & [@liamstewart](https://github.com/liamstewart)] +1. [Stripe](https://stripe.com) [[@jbalogh](https://github.com/jbalogh)] +1. [Strongmind](https://www.strongmind.com) [[@tomchapin](https://github.com/tomchapin) & [@wongstein](https://github.com/wongstein)] +1. [Surfline](https://www.surfline.com/) [[@jawang35](https://github.com/jawang35)] +1. [Syapse](https://www.syapse.com/) [[@zedmor](https://github.com/zedmor)] +1. [T2 Systems](http://t2systems.com) [[@unclaimedpants](https://github.com/unclaimedpants)] +1. [TEK](https://www.tek.fi/en) [[@telac](https://github.com/telac)] +1. [THE ICONIC](https://www.theiconic.com.au/) [[@revathijay](https://github.com/revathijay), [@ilikedata](https://github.com/ilikedata)] +1. [Tails.com](https://tails.com/) [[@alanmcruickshank](https://github.com/alanmcruickshank)] +1. [Telefonica Innovation Alpha](https://www.alpha.company/) [[@Alpha-Health](https://github.com/Alpha-health)] +1. [Telia Company](https://www.teliacompany.com/en) +1. [Ternary Data](https://ternarydata.com/) [[@mhousley](https://github.com/mhousley), [@JoeReis](https://github.com/JoeReis)] +1. [Tesla](https://www.tesla.com/) [[@thoralf-gutierrez](https://github.com/thoralf-gutierrez)] +1. [The Climate Corporation](https://climate.com/) [[@jmelching](https://github.com/jmelching)] +1. [The Home Depot](https://www.homedepot.com/) [[@apekshithr](https://github.com/apekshithr)] +1. [Thinking Machines](https://thinkingmachin.es) [[@marksteve](https://github.com/marksteve)] +1. [Thinknear](https://www.thinknear.com/) [[@d3cay1](https://github.com/d3cay1), [@ccson](https://github.com/ccson), & [@ababian](https://github.com/ababian)] +1. [ThoughtWorks](https://www.thoughtworks.com/) [[@sann3](https://github.com/sann3)] +1. [ThredUP](https://www.thredup.com/) [[@kosteev](https://github.com/kosteev)] +1. [Thumbtack](https://www.thumbtack.com/) [[@kamalacharya](https://github.com/kamalacharya), [@dwjoss](https://github.com/dwjoss)] +1. [Tictail](https://tictail.com/) +1. [Tile](https://tile.com/) [[@ranjanmanish](https://github.com/ranjanmanish)] +1. [Tinder](https://tinder.com/) [[@kbendick](https://github.com/kbendick)] +1. [Tink](https://tink.com/) [[@tink-ab](https://github.com/tink-ab)] +1. [TokenAnalyst](https://github.com/tokenanalyst) [[@simonohanlon101](https://github.com/simonohanlon101), [@ankitchiplunkar](https://github.com/ankitchiplunkar), [@sidshekhar](https://github.com/sidshekhar), [@sp6pe](https://github.com/sp6pe)] +1. [Tokopedia](https://www.tokopedia.com/) [[@topedmaria](https://github.com/topedmaria)] +1. [Trocafone](https://www.trocafone.com/) [[@idontdomath](https://github.com/idontdomath) & [@gseva](https://github.com/gseva) & [@ordonezf](https://github.com/ordonezf) & [@PalmaLeandro](https://github.com/PalmaLeandro)] +1. [TruFactor](https://trufactor.io/) [[@gholmes](https://github.com/gholmes) & [@angadsingh](https://github.com/angadsingh/)] +1. [Twine Labs](https://www.twinelabs.com/) [[@ivorpeles](https://github.com/ivorpeles)] +1. [Twitter](https://www.twitter.com/) [[@aoen](https://github.com/aoen)] +1. [USC Graduate School, University of Southern California](https://graduateschool.usc.edu/) [[@abhilash1in](https://github.com/abhilash1in), [@sudarshansunder](https://github.com/sudarshansunder)] +1. [Ubisoft](https://www.ubisoft.com/) [[@Walkoss](https://github.com/Walkoss)] +1. 
[Udacity](https://www.udacity.com/) [[@dandikunited](https://github.com/DandikUnited), [@simon-uc](https://github.com/simon-uc)] +1. [Umami Collective](https://umamicollective.com) [[@juanuicich](https://github.com/juanuicich)] +1. [United Airlines](https://www.united.com/) [[@ilopezfr](https://github.com/ilopezfr)] +1. [Upsight](https://www.upsight.com) +1. [VeeR VR](https://veer.tv) [[@pishilong](https://github.com/pishilong)] +1. [Veikkaus](https://www.veikkaus.fi) [[@hixus](https://github.com/hixus)] +1. [Vente-Exclusive.com](http://www.vente-exclusive.com/) [[@alexvanboxel](https://github.com/alexvanboxel)] +1. [Vevo](https://www.vevo.com/) [[@csetiawan](https://github.com/csetiawan) & [@jerrygillespie](https://github.com/jerrygillespie)] +1. [Vidio](https://www.vidio.com/) +1. [Ville de Montréal](http://ville.montreal.qc.ca/) [[@VilledeMontreal](https://github.com/VilledeMontreal/)] +1. [Vnomics](https://github.com/vnomics) [[@lpalum](https://github.com/lpalum)] +1. [Walmart Labs](https://www.walmartlabs.com) [[@bharathpalaksha](https://github.com/bharathpalaksha), [@vipul007ravi](https://github.com/vipul007ravi)] +1. [Waze](https://www.waze.com) [[@waze](https://github.com/wazeHQ)] +1. [WePay](http://www.wepay.com) [[@criccomini](https://github.com/criccomini) & [@mtagle](https://github.com/mtagle)] +1. [WeTransfer](https://github.com/WeTransfer) [[@coredipper](https://github.com/coredipper) & [@higee](https://github.com/higee) & [@azclub](https://github.com/azclub)] +1. [Whistle Labs](http://www.whistle.com) [[@ananya77041](https://github.com/ananya77041)] +1. [Wildlifestudios](https://wildlifestudios.com/) +1. [WiseBanyan](https://wisebanyan.com/) +1. [WixAnswers](https://www.wixanswers.com/) [[@eladkal](https://github.com/eladkal)] +1. [Wix](https://www.wix.com/) [[@eladkal](https://github.com/eladkal)] +1. [Wooga](https://www.wooga.com/) +1. [WorldRemit](https://www.worldremit.com/) [[@boittega](https://github.com/boittega)] +1. [Wrike](https://www.wrike.com) [[@eliseealex](https://github.com/eliseealex) & [teoretic6](https://github.com/Teoretic6)] +1. [Xero](https://www.xero.com/) [[@yan9yu](https://github.com/yan9yu) & [adamantnz](https://github.com/adamantnz/)] +1. [Xoom](https://www.xoom.com/) +1. [Yahoo!](https://www.yahoo.com/) +1. [Yieldr](https://www.yieldr.com/) [[@ggeorgiadis](https://github.com/ggeorgiadis)] +1. [Zapier](https://www.zapier.com) [[@drknexus](https://github.com/drknexus) & [@statwonk](https://github.com/statwonk)] +1. [Zego](https://www.zego.com/) [[@ruimffl](https://github.com/ruimffl), [@james-welly](https://github.com/james-welly), [@ken-payne](https://github.com/ken-payne)] +1. [Zendesk](https://www.github.com/zendesk) +1. [Zenly](https://zen.ly) [[@cerisier](https://github.com/cerisier) & [@jbdalido](https://github.com/jbdalido)] +1. [Zerodha](https://zerodha.com/) [[@johnnybravo-xyz](https://github.com/johnnybravo-xyz)] +1. [Zymergen](https://www.zymergen.com/) +1. [Zynga](https://www.zynga.com) +1. [allegro.pl](http://allegro.tech/) [[@kretes](https://github.com/kretes)] +1. [ciValue](https://civalue.com/) [[@chencivalue](https://github.com/chencivalue), [@YoavGaudin](https://github.com/YoavGaudin), [@saleem-boshnak](https://github.com/saleem-boshnak)] +1. [evo.company](https://evo.company/) [[@orhideous](https://github.com/orhideous)] +1. [happn](https://www.happn.com) [[@pcorbel](https://github.com/pcorbel)] +1. [iHeartRadio](http://www.iheart.com/) [[@yiwang](https://github.com/yiwang)] +1. 
[iS2.co](https://www.is2.co) [[@iS2co](https://github.com/iS2co)]
+1. [imgix](https://www.imgix.com/) [[@dclubb](https://github.com/dclubb)]
+1. [liligo](http://liligo.com/) [[@tromika](https://github.com/tromika)]
+1. [proton.ai](https://proton.ai/) [[@prmsolutions](https://github.com/prmsolutions)]
+1. [uSmart Securities](https://www.usmartsecurities.com/hk/en/) [[@yangrong688](https://github.com/yangrong688)]
diff --git a/LICENSE b/LICENSE
index 4cac7bd45ddcf..a0663a682d884 100644
--- a/LICENSE
+++ b/LICENSE
@@ -241,7 +241,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
     (MIT License) ElasticMock v1.3.2 (https://github.com/vrcmarcos/elasticmock)
     (MIT License) MomentJS v2.22.2 (http://momentjs.com/)
     (MIT License) moment-strftime v0.5.0 (https://github.com/benjaminoakes/moment-strftime)
-    (MIT License) python-slugify v2.0.1 (https://github.com/un33k/python-slugify)
+    (MIT License) python-slugify v4.0.0 (https://github.com/un33k/python-slugify)
     (MIT License) python-nvd3 v0.15.0 (https://github.com/areski/python-nvd3)
     (MIT License) eonasdan-bootstrap-datetimepicker v4.17.37 (https://github.com/eonasdan/bootstrap-datetimepicker/)
diff --git a/LOCAL_VIRTUALENV.rst b/LOCAL_VIRTUALENV.rst
index e45e76cd66add..03b60c81df028 100644
--- a/LOCAL_VIRTUALENV.rst
+++ b/LOCAL_VIRTUALENV.rst
@@ -36,11 +36,11 @@ These are examples of the development options available with the local virtualen
 * local debugging;
 * Airflow source view;
-* autocompletion;
+* auto-completion;
 * documentation support;
 * unit tests.
 
-This document describes minimum requirements and insructions for using a standalone version of the local virtualenv.
+This document describes minimum requirements and instructions for using a standalone version of the local virtualenv.
 
 Prerequisites
 =============
@@ -118,6 +118,23 @@ To create and initialize the local virtualenv:
     pip install -U -e ".[devel,]" # for example: pip install -U -e ".[devel,gcp,postgres]"
 
+.. note::
+   On 30th of November 2020, a new version of pip (20.3) has been released with a new 2020 resolver.
+   This resolver does not yet work with Apache Airflow and might lead to errors in installation -
+   depending on your choice of extras. In order to install Airflow you need to either downgrade
+   pip to version 20.2.4 (``pip install --upgrade pip==20.2.4``) or, in case you use pip 20.3, add the option
+   ``--use-deprecated legacy-resolver`` to your pip install command.
+
+
+In case you have problems with installing Airflow because some requirements are not installable, you can
+try to install it with the set of working constraints (note that there are different constraint files
+for different python versions):
+
+  .. code-block:: bash
+
+    pip install -U -e ".[devel,]" \
+      --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-master/constraints-3.6.txt"
+
 Note: when you first initialize database (the next step), you may encounter some problems.
 This is because airflow by default will try to load in example dags where some of them requires dependencies ``gcp`` and ``postgres``.
 You can solve the problem by:
@@ -141,12 +158,44 @@ You can solve the problem by:
 Note that if you have the Breeze development environment installed, the ``breeze``
 script can automate initializing the created virtualenv (steps 2 and 3).
-Simply enter the Breeze environment by using ``workon`` and, once you are in it, run:
+Activate your virtualenv, e.g. by using ``workon``, and once you are in it, run:
 
 .. code-block:: bash
 
   ./breeze initialize-local-virtualenv
 
+5. (optionally) run yarn build if you plan to run the webserver
+
+.. code-block:: bash
+
+    cd airflow/www
+    yarn build
+
+Developing Providers
+--------------------
+
+In Airflow 2.0 we introduced a split of Apache Airflow into separate packages - there is one main
+apache-airflow package with the core of Airflow and 70+ packages for all providers (external services
+and software Airflow can communicate with).
+
+Developing providers is part of Airflow development, but when you install airflow as editable in your local
+development environment, the corresponding provider packages will also be installed from PyPI. However, the
+providers will also be present in your "airflow/providers" folder. This might lead to confusion about
+which sources of providers are imported during development. In general it will depend on your
+environment's PYTHONPATH setting.
+
+In order to avoid the confusion, you can set the ``INSTALL_PROVIDERS_FROM_SOURCES`` environment variable to ``true``
+before running the ``pip install`` command:
+
+.. code-block:: bash
+
+    INSTALL_PROVIDERS_FROM_SOURCES="true" pip install -U -e ".[devel,]" \
+      --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-master/constraints-3.6.txt"
+
+This way no provider packages will be installed from PyPI and providers will always be imported from the
+"airflow/providers" folder.
+
+
 Running Tests
 -------------
diff --git a/MANIFEST.in b/MANIFEST.in
index e4afd000bd415..dae0f3efef4ad 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -27,7 +27,6 @@ graft airflow/www_rbac
 graft airflow/www_rbac/static
 graft airflow/www_rbac/templates
 graft airflow/www_rbac/translations
-graft airflow/_vendor/
 include airflow/alembic.ini
 include airflow/git_version
 include airflow/serialization/schema.json
diff --git a/PULL_REQUEST_WORKFLOW.rst b/PULL_REQUEST_WORKFLOW.rst
new file mode 100644
index 0000000000000..c9cc6bf414830
--- /dev/null
+++ b/PULL_REQUEST_WORKFLOW.rst
@@ -0,0 +1,260 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ .. http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+.. contents:: :local:
+
+Why non-standard pull request workflow?
+---------------------------------------
+
+This document describes the Pull Request Workflow we've implemented in Airflow. The workflow is slightly
+more complex than the regular workflow you might encounter in most projects because, after experiencing
+some huge delays in processing queues in October 2020 with GitHub Actions, we've decided to optimize the
+workflow to minimize the use of GitHub Actions build time by using a selective approach to which tests
+and checks are run in the CI system, depending on an analysis of which files changed in the incoming PR,
+and by allowing the committers to control the scope of the tests during the approval/review process.
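To make this selective approach more concrete, the sketch below shows the general idea of deriving a couple of workflow outputs from the list of changed files. It is a simplified illustration, not the actual ``scripts/ci/selective_ci_checks.sh`` logic; the output names (``needs-tests``, ``needs-image``) are made up for this example, and it uses the ``::set-output`` syntax that GitHub Actions supported at the time.

.. code-block:: bash

    #!/usr/bin/env bash
    # Simplified illustration of selective checks - not the real script.

    # Files changed in the PR's merge commit, compared with its first parent.
    CHANGED_FILES=$(git diff --name-only HEAD^1 HEAD)

    needs_tests="false"
    needs_image="false"

    for changed_file in ${CHANGED_FILES}; do
        case "${changed_file}" in
            airflow/*|chart/*|tests/*|kubernetes_tests/*)
                # Source or test changes require both the CI image and the tests.
                needs_tests="true"
                needs_image="true"
                ;;
            docs/*)
                # Documentation changes only need the CI image.
                needs_image="true"
                ;;
        esac
    done

    # Expose the decisions to subsequent workflow steps.
    echo "::set-output name=needs-tests::${needs_tests}"
    echo "::set-output name=needs-image::${needs_image}"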
+
+Just to give a bit of context: we started off with the approach of always running all tests for all
+incoming PRs. However, as our matrix of tests grew, this approach did not scale with the increasing
+number of PRs, and we had to compete with other Apache Software Foundation projects for the 180
+slots that are available for the whole organization. More Apache Software Foundation projects started
+to use GitHub Actions and we started to experience long queues while our jobs waited for free slots.
+
+We approached the problem by:
+
+1) Improving the mechanism of cancelling duplicate workflow runs more efficiently in case of queue conditions
+   (duplicate workflow runs are generated when someone pushes a fixup quickly - leading to running both the
+   out-dated and the current run to completion, taking precious slots). This has been implemented by improving the
+   `cancel-workflow-run `_ action we are using. In version
+   4.1 it got a new feature of cancelling all duplicates even if there is a long queue of builds.
+
+2) Heavily decreasing the strain on the GitHub Actions jobs by introducing selective checks - a mechanism
+   to control which parts of the tests are run during the CI build. This is implemented by the
+   ``scripts/ci/selective_ci_checks.sh`` script in our repository. This script analyses which part of the
+   code has changed and, based on that, it sets the right outputs that control which tests are executed in
+   the CI build, and whether we need to build the CI images necessary to run those steps. This allowed us to
+   heavily decrease the strain, especially for Pull Requests that do not touch code (in which case
+   the builds can complete in < 2 minutes), but also by limiting the number of tests executed in PRs that do
+   not touch the "core" of Airflow, or only touch some standalone parts of Airflow such as
+   "Providers", "WWW" or "CLI". This solution is not yet perfect as there are likely some edge cases, but
+   it is easy to maintain and we have an escape hatch - all the tests are always executed in master pushes,
+   so contributors can easily spot if there is a "missed" case and fix it - both by fixing the problem and
+   adding those exceptions to the code. More about it can be found in the
+   `Selective CI checks <#selective-ci-checks>`_ chapter.
+
+3) Even more optimisation came from limiting the scope of tests to only the "default" matrix parameters. So far
+   in Airflow we have always run all tests for all matrix combinations. The primary matrix components are:
+
+   * Python versions (currently 3.6, 3.7, 3.8)
+   * Backend types (currently MySQL/Postgres)
+   * Backend versions (currently MySQL 5.7, MySQL 8, Postgres 9.6, Postgres 13)
+
+   We've decided that instead of running all the combinations of parameters for all matrix components, we will
+   only run the default values (Python 3.6, MySQL 5.7, Postgres 9.6) for all PRs which are not yet approved by
+   the committers. This has a nice effect: the full set of tests (though with limited combinations of
+   the matrix) is still run in the CI for every Pull Request that needs tests at all - allowing the
+   contributors to make sure that their PR is "good enough" to be reviewed.
+
+   Even after approval, the automated workflows we've implemented check if the PR seems to need the
+   "full test matrix" and provide helpful information to both contributors and committers in the form of
+   explanatory comments and labels set automatically, showing the status of the PR. Committers still have
+   control whether they want to merge such requests automatically, ask for a rebase, or re-run the tests
+   and run the "full tests" by applying the "full tests needed" label and re-running such a request.
+   The "full tests needed" label is also applied automatically after approval when the change touches
+   the "core" of Airflow - also, a separate check is added to the PR so that the "merge" button status
+   will indicate to the committer that full tests are still needed. The committer might still decide
+   whether to merge such a PR without the "full matrix". The "escape hatch" we have - i.e. running the full
+   matrix of tests in the "merge push" - will enable committers to catch and fix such problems quickly.
+   More about it can be found in the `Approval workflow and Matrix tests <#approval-workflow-and-matrix-tests>`_
+   chapter.
+
+4) We've also applied for (and received) funds to run self-hosted runners. This is not yet implemented, due to
+   discussions about the security of self-hosted runners for public repositories. Running self-hosted runners by
+   public repositories is currently (as of end of October 2020)
+   `Discouraged by GitHub `_
+   and we are working on solving the problem - also involving the Apache Software Foundation infrastructure team.
+   This document does not describe this part of the approach. Most likely we will soon add a document
+   describing the details of the approach taken there.
+
+Selective CI Checks
+-------------------
+
+In order to optimise our CI builds, we've implemented optimisations to only run selected checks for some
+kinds of changes. The logic implemented reflects the internal architecture of Airflow 2.0 packages
+and it helps to keep down both the usage of jobs in GitHub Actions as well as CI feedback time to
+contributors in case of simpler changes.
+
+We have the following test types (separated by the packages in which they are located):
+
+* Always - tests that should always be executed (always folder)
+* Core - tests for the core Airflow functionality (core folder)
+* API - tests for the Airflow API (api and api_connexion folders)
+* CLI - tests for the Airflow CLI (cli folder)
+* WWW - tests for the Airflow webserver (www and, in 1.10, www_rbac folders)
+* Providers - tests for all Providers of Airflow (providers folder)
+* Other - all other tests (all other folders that are not part of any of the above)
+
+We also have several special kinds of tests that are not separated by packages; they are marked with
+pytest markers instead. They can be found in any of those packages and they can be selected by the appropriate
+pytest custom command line options. See `TESTING.rst `_ for details, but those are:
+
+* Integration - tests that require external integration images running in docker-compose
+* Heisentests - tests that are vulnerable to some side effects and are better run on their own
+* Quarantined - tests that are flaky and need to be fixed
+* Postgres - tests that require a Postgres database. They are only run when the backend is Postgres
+* MySQL - tests that require a MySQL database. They are only run when the backend is MySQL
+
+Even though the test types are separated, if they share the same backend version/python version they are
+run sequentially in the same job, on the same CI machine. Each of them runs in a separate ``docker run`` command,
+with additional docker cleanup between the steps, so as not to fall into the trap of exceeding resource
+usage in one big test run, while also not increasing the number of jobs per Pull Request.
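A minimal sketch of this "one ``docker run`` per test type, with cleanup in between" approach is shown below. The test type names come from the list above; the image variable, the in-container script path and the cleanup command are placeholder assumptions rather than the actual CI code.

.. code-block:: bash

    #!/usr/bin/env bash
    # Illustrative sketch only - placeholders, not the actual CI scripts.

    # Test types selected for this build (subset chosen by the selective checks).
    TEST_TYPES=${TEST_TYPES:="Always API CLI Providers WWW Core"}

    for test_type in ${TEST_TYPES}; do
        # Run each test type in its own container.
        docker run --rm \
            -e TEST_TYPE="${test_type}" \
            "${AIRFLOW_CI_IMAGE}" \
            "/opt/airflow/scripts/ci/run_single_test_type.sh"

        # Clean up between the runs so one big test run does not exhaust resources.
        docker system prune --force --volumes
    done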
+
+The logic implemented for the changes works as follows:
+
+1) In case of a direct push (so when a PR gets merged) or a scheduled run, we always run all tests and checks.
+   This is in order to make sure that the merge did not miss anything important. The remainder of the logic
+   is executed only in case of Pull Requests.
+
+2) We retrieve which files have changed in the incoming Merge Commit (github.sha is a merge commit
+   automatically prepared by GitHub in case of a Pull Request, so we can retrieve the list of changed
+   files from that commit directly).
+
+3) If any of the important environment files changed (Dockerfile, ci scripts, setup.py, GitHub workflow
+   files), then we again run all tests and checks. Those are cases where the logic of the checks changed
+   or the environment for the checks changed, so we want to make sure to check everything.
+
+4) If any docs changed, we need to have the CI image, so we enable image building.
+
+5) If any chart files changed, we need to run helm tests, so we enable helm unit tests.
+
+6) If any API files changed, we need to run API tests, so we enable them.
+
+7) If any of the relevant source files that trigger the tests have changed at all - those are airflow
+   sources, chart, tests and kubernetes_tests - we enable tests and we enable image building,
+   because the CI images are needed to run tests.
+
+8) Then we determine which types of the tests should be run. We count all the changed files in the
+   relevant airflow sources (airflow, chart, tests, kubernetes_tests) first and then we count how many
+   files changed in different packages:
+
+   a) in any case, tests in the ``Always`` folder are run. Those are special tests that should be run any time
+      modifications to any Python code occur. An example test of this type is verifying the proper structure of
+      the project, including the proper naming of all files.
+   b) if any of the Airflow API files changed, we enable the ``API`` test type
+   c) if any of the Airflow CLI files changed, we enable the ``CLI`` test type
+   d) if any of the Provider files changed, we enable the ``Providers`` test type
+   e) if any of the WWW files changed, we enable the ``WWW`` test type
+   f) if any of the Kubernetes files changed, we enable the ``Kubernetes`` test type
+   g) Then we subtract the count of all the ``specific`` per-type changed files above from the count of
+      all changed files. In case any changed files remain, we assume that some unknown files
+      changed (likely from the core of Airflow) and in this case we enable all the test types above plus the
+      Core test type - simply because we do not want to risk missing anything.
+   h) In all cases where tests are enabled, we also add Heisentests, Integration and - depending on
+      the backend used - the Postgres or MySQL types of tests.
+
+9) Quarantined tests are always run when tests are run - we need to run them often to observe how
+   often they fail so that we can decide to move them out of quarantine. Details about the
+   Quarantined tests are described in `TESTING.rst `_
+
+10) There is a special case of static checks. In case the above logic determines that the CI image
+    needs to be built, we run the long and more comprehensive version of the static checks - including Pylint,
+    MyPy and Flake8. Those checks are run on all files, no matter how many files changed.
+
+Similarly to selective tests, we also run selective security scans. In Pull Requests,
+the Python scan will only run when there is a Python code change, and the JavaScript scan will only run
+if there is a JavaScript or yarn.lock file change. For master builds, all scans are always executed.
+
+The selective check algorithm is shown here:
+
+.. image:: images/pr/selective_checks.png
+    :align: center
+    :alt: Selective check algorithm
+
+Approval Workflow and Matrix tests
+----------------------------------
+
+As explained above, the approval and matrix tests workflow works according to the algorithm below:
+
+1) In case of "no-code" changes - changes that do not touch any of the code or the environment of
+   the application - no tests are run (this is done via the selective checks above). Also, no CI/PROD
+   images are built, saving extra minutes. Such a build currently takes less than 2 minutes and only a
+   few jobs are run, which is a very small fraction of the "full build" time.
+
+2) When a new PR is created, only a "default set" of the matrix tests is run. Only default
+   values for each of the parameters are used, effectively limiting it to running matrix builds for only
+   one Python version and one version of each of the backends. In this case only one CI and one PROD
+   image is built, saving precious job slots. This build takes around 50% less time than the
+   "full matrix" build.
+
+3) When such a PR gets approved, the system further analyses the files changed in this PR and a further
+   decision is made that should be communicated to both Committer and Reviewer.
+
+3a) In case of "no-code" builds, a message is communicated that the PR is ready to be merged and
+    no tests are needed.
+
+.. image:: images/pr/pr-no-tests-needed-comment.png
+    :align: center
+    :alt: No tests needed for "no-code" builds
+
+3b) In case of "non-core" builds, a message is communicated that such a PR is likely OK to be merged as
+    is with a limited set of tests, but that the committer might decide to re-run the PR after applying
+    the "full tests needed" label, which will trigger a full matrix build of tests for this PR. The
+    committer might make a further decision on what to do with this PR.
+
+.. image:: images/pr/pr-likely-ok-to-merge.png
+    :align: center
+    :alt: Likely ok to merge the PR with only small set of tests
+
+3c) In case of "core" builds (i.e. when the PR touches some "core" part of Airflow), a message is
+    communicated that this PR needs the "full test matrix", the "full tests needed" label is applied
+    automatically, and either the contributor can rebase the request to trigger a full test build or the
+    committer can re-run the build manually to trigger such a full test rebuild. Also, an "in-progress"
+    check is added, so that the committer realises that the PR is not yet "green to merge". Pull requests
+    with the "full tests needed" label always trigger the full matrix build when rebased or re-run, so if
+    the PR gets rebased it will continue triggering the full matrix build.
+
+.. image:: images/pr/pr-full-tests-needed.png
+    :align: center
+    :alt: Full tests are needed for the PR
+
+4) If this or another committer "requests changes" in a previously approved PR with the
+   "full tests needed" label, the bot automatically removes the label, moving it back to the
+   "run only default set of parameters" mode. For PRs touching the core of Airflow, once the PR gets
+   approved again the label will be restored. If it was set manually by the committer, it has to be
+   restored manually.
+
+.. note:: Note that setting the labels and adding comments might be delayed due to limitations of
+   GitHub Actions; in case of queues, processing of Pull Request reviews might take some time, so it is
+   advised not to merge a PR immediately after approval. Luckily, the comments describing the status of
+   the PR trigger notifications for the PRs, and they provide a good "notification" for the committer to
+   act on a PR that was recently approved.
+
+The PR approval workflow is possible thanks to two custom GitHub Actions we've developed:
+
+* `Get workflow origin `_
+* `Label when approved `_
+
+
+Next steps
+----------
+
+We are also planning to propose the approach to other projects from the Apache Software Foundation to
+make it a common approach, so that our effort is not limited to only one project.
+
+Discussion about it can be found in `this discussion `_
diff --git a/README.md b/README.md
index 136193c36dff6..cf9e4697d5e68 100644
--- a/README.md
+++ b/README.md
@@ -19,19 +19,20 @@
 # Apache Airflow
 
 [![PyPI version](https://badge.fury.io/py/apache-airflow.svg)](https://badge.fury.io/py/apache-airflow)
-[![Build Status](https://travis-ci.org/apache/airflow.svg?branch=master)](https://travis-ci.org/apache/airflow)
+[![GitHub Build](https://github.com/apache/airflow/workflows/CI%20Build/badge.svg)](https://github.com/apache/airflow/actions)
 [![Coverage Status](https://img.shields.io/codecov/c/github/apache/airflow/master.svg)](https://codecov.io/github/apache/airflow?branch=master)
 [![Documentation Status](https://readthedocs.org/projects/airflow/badge/?version=latest)](https://airflow.readthedocs.io/en/latest/?badge=latest)
 [![License](http://img.shields.io/:license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.txt)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/apache-airflow.svg)](https://pypi.org/project/apache-airflow/)
-[![Twitter Follow](https://img.shields.io/twitter/follow/ApacheAirflow.svg?style=social&label=Follow)](https://twitter.com/ApacheAirflow)
+[![Docker Pulls](https://img.shields.io/docker/pulls/apache/airflow.svg)](https://hub.docker.com/r/apache/airflow)
+[![Docker Stars](https://img.shields.io/docker/stars/apache/airflow.svg)](https://hub.docker.com/r/apache/airflow)
 
-_NOTE: The transition from 1.8.0 (or before) to 1.8.1 (or after) requires uninstalling Apache Airflow before installing the new version. The package name was changed from `airflow` to `apache-airflow` as of version 1.8.1._
+[![Twitter Follow](https://img.shields.io/twitter/follow/ApacheAirflow.svg?style=social&label=Follow)](https://twitter.com/ApacheAirflow)
+[![Slack Status](https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&style=social)](https://s.apache.org/airflow-slack)
 
-Apache Airflow (or simply Airflow) is a platform to programmatically author, schedule, and monitor workflows.
+[Apache Airflow](https://airflow.apache.org/docs/stable/) (or simply Airflow) is a platform to programmatically author, schedule, and monitor workflows.
-When workflows are defined as code, they become more maintainable, -versionable, testable, and collaborative. +When workflows are defined as code, they become more maintainable, versionable, testable, and collaborative. Use Airflow to author workflows as directed acyclic graphs (DAGs) of tasks. The Airflow scheduler executes your tasks on an array of workers while following the specified dependencies. Rich command line utilities make performing complex surgeries on DAGs a snap. The rich user interface makes it easy to visualize pipelines running in production, monitor progress, and troubleshoot issues when needed. @@ -45,18 +46,20 @@ Use Airflow to author workflows as directed acyclic graphs (DAGs) of tasks. The - [运行](#运行) - [i18n](#i18n) - [打包airflow](#打包airflow) + - [Project Focus](#project-focus) + - [Principles](#principles) - [Requirements](#requirements) - - [Master version (2.0.0dev)](#master-version-200dev) - - [Stable version (1.10.9)](#stable-version-1109) + - [Additional notes on Python version requirements](#additional-notes-on-python-version-requirements) - [Getting started](#getting-started) - - [Beyond the Horizon](#beyond-the-horizon) - - [Principles](#principles) + - [Installing from PyPI](#installing-from-pypi) + - [Official source code](#official-source-code) + - [Convenience packages](#convenience-packages) - [User Interface](#user-interface) - - [Using hooks and Operators from "master" in Airflow 1.10](#using-hooks-and-operators-from-master-in-airflow-110) - [Contributing](#contributing) - - [Who uses Airflow?](#who-uses-airflow) + - [Who uses Apache Airflow?](#who-uses-apache-airflow) - [Who Maintains Apache Airflow?](#who-maintains-apache-airflow) - [Can I use the Apache Airflow logo in my presentation?](#can-i-use-the-apache-airflow-logo-in-my-presentation) + - [Airflow merchandise](#airflow-merchandise) - [Links](#links) @@ -100,293 +103,168 @@ Use Airflow to author workflows as directed acyclic graphs (DAGs) of tasks. The `python setup.py sdist bdist_wheel` +## Project Focus + +Airflow works best with workflows that are mostly static and slowly changing. When DAG structure is similar from one run to the next, it allows for clarity around unit of work and continuity. Other similar projects include [Luigi](https://github.com/spotify/luigi), [Oozie](https://oozie.apache.org/) and [Azkaban](https://azkaban.github.io/). + +Airflow is commonly used to process data, but has the opinion that tasks should ideally be idempotent (i.e. results of the task will be the same, and will not create duplicated data in a destination system), and should not pass large quantities of data from one task to the next (though tasks can pass metadata using Airflow's [Xcom feature](https://airflow.apache.org/docs/stable/concepts.html#xcoms)). For high-volume, data-intensive tasks, a best practice is to delegate to external services that specialize on that type of work. + +Airflow is not a streaming solution, but it is often used to process real-time data, pulling data off streams in batches. + +## Principles + +- **Dynamic**: Airflow pipelines are configuration as code (Python), allowing for dynamic pipeline generation. This allows for writing code that instantiates pipelines dynamically. +- **Extensible**: Easily define your own operators, executors and extend the library so that it fits the level of abstraction that suits your environment. +- **Elegant**: Airflow pipelines are lean and explicit. 
Parameterizing your scripts is built into the core of Airflow using the powerful **Jinja** templating engine. +- **Scalable**: Airflow has a modular architecture and uses a message queue to orchestrate an arbitrary number of workers. + ## Requirements Apache Airflow is tested with: -### Master version (2.0.0dev) +| | Master version (2.1.0dev) | Stable version (1.10.15) | +| ------------ | ------------------------- | ------------------------ | +| Python | 3.6, 3.7, 3.8 | 2.7, 3.5, 3.6, 3.7, 3.8 | +| PostgreSQL | 9.6, 10, 11, 12, 13 | 9.6, 10, 11, 12, 13 | +| MySQL | 5.7, 8 | 5.6, 5.7 | +| SQLite | latest stable | latest stable | +| Kubernetes | 1.16.9, 1.17.5, 1.18.6 | 1.16.9, 1.17.5, 1.18.6 | -* Python versions: 3.6, 3.7 -* Postgres DB: 9.6, 10 -* MySQL DB: 5.7 -* Sqlite - latest stable (it is used mainly for development purpose) +**Note:** MariaDB and MySQL 5.x are unable to or have limitations with +running multiple schedulers -- please see the "Scheduler" docs. -### Stable version (1.10.9) +**Note:** SQLite is used in Airflow tests. Do not use it in production. -* Python versions: 2.7, 3.5, 3.6, 3.7 -* Postgres DB: 9.6, 10 -* MySQL DB: 5.6, 5.7 -* Sqlite - latest stable (it is used mainly for development purpose) +### Additional notes on Python version requirements + +* Stable version [requires](https://github.com/apache/airflow/issues/8162) at least Python 3.5.3 when using Python 3 ## Getting started -Please visit the Airflow Platform documentation (latest **stable** release) for help with [installing Airflow](https://airflow.apache.org/installation.html), getting a [quick start](https://airflow.apache.org/start.html), or a more complete [tutorial](https://airflow.apache.org/tutorial.html). -Documentation of GitHub master (latest development branch): [ReadTheDocs Documentation](https://airflow.readthedocs.io/en/latest/) +Visit the official Airflow website documentation (latest **stable** release) for help with [installing Airflow](https://airflow.apache.org/installation.html), [getting started](https://airflow.apache.org/start.html), or walking through a more complete [tutorial](https://airflow.apache.org/tutorial.html). -For further information, please visit the [Airflow Wiki](https://cwiki.apache.org/confluence/display/AIRFLOW/Airflow+Home). +> Note: If you're looking for documentation for master branch (latest development branch): you can find it on [s.apache.org/airflow-docs](https://s.apache.org/airflow-docs/). -Official container (Docker) images for Apache Airflow are described in [IMAGES.rst](IMAGES.rst). +For more information on Airflow's Roadmap or Airflow Improvement Proposals (AIPs), visit the [Airflow Wiki](https://cwiki.apache.org/confluence/display/AIRFLOW/Airflow+Home). -## Beyond the Horizon +Official Docker (container) images for Apache Airflow are described in [IMAGES.rst](IMAGES.rst). -Airflow **is not** a data streaming solution. Tasks do not move data from -one to the other (though tasks can exchange metadata!). Airflow is not -in the [Spark Streaming](http://spark.apache.org/streaming/) -or [Storm](https://storm.apache.org/) space, it is more comparable to -[Oozie](http://oozie.apache.org/) or -[Azkaban](https://azkaban.github.io/). +## Installing from PyPI -Workflows are expected to be mostly static or slowly changing. You can think -of the structure of the tasks in your workflow as slightly more dynamic -than a database structure would be. 
Airflow workflows are expected to look
-similar from a run to the next, this allows for clarity around
-unit of work and continuity.
+We publish Apache Airflow as the `apache-airflow` package in PyPI. Installing it, however, might
+sometimes be tricky because Airflow is a bit of both a library and an application. Libraries usually
+keep their dependencies open and applications usually pin them, but we should do neither and both at
+the same time. We decided to keep our dependencies as open as possible (in `setup.py`) so users can
+install different versions of libraries if needed. This means that from time to time plain
+`pip install apache-airflow` will not work or will produce an unusable Airflow installation.
 
-## Principles
+In order to have a repeatable installation, we also keep a set of "known-to-be-working" constraint
+files in the orphan `constraints-master` and `constraints-1-10` branches (introduced in
+**Airflow 1.10.10** and updated in **Airflow 1.10.12**). We keep those "known-to-be-working"
+constraint files separately per major/minor Python version.
+You can use them as constraint files when installing Airflow from PyPI. Note that you have to specify
+the correct Airflow tag/version/branch and Python version in the URL.
 
-- **Dynamic**: Airflow pipelines are configuration as code (Python), allowing for dynamic pipeline generation. This allows for writing code that instantiates pipelines dynamically.
-- **Extensible**: Easily define your own operators, executors and extend the library so that it fits the level of abstraction that suits your environment.
-- **Elegant**: Airflow pipelines are lean and explicit. Parameterizing your scripts is built into the core of Airflow using the powerful **Jinja** templating engine.
-- **Scalable**: Airflow has a modular architecture and uses a message queue to orchestrate an arbitrary number of workers.
+**NOTE!!!**
+
+On 30 November 2020, a new version of pip (20.3) was released with a new, 2020 resolver. This resolver
+does not yet work with Apache Airflow and might lead to errors during installation, depending on your
+choice of extras. In order to install Airflow, you need to either downgrade pip to version 20.2.4
+(`pip install --upgrade pip==20.2.4`) or, in case you use pip 20.3, you need to add the option
+`--use-deprecated legacy-resolver` to your `pip install` command.
+
+1. Installing just Airflow:
+
+```bash
+pip install apache-airflow==1.10.15 \
+ --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.15/constraints-3.7.txt"
+```
+
+2. Installing with extras (for example postgres, google):
+
+```bash
+pip install apache-airflow[postgres,google]==1.10.15 \
+ --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.15/constraints-3.7.txt"
+```
+
+For information on installing backport providers, check [/docs/backport-providers.rst](/docs/backport-providers.rst).
+ +## Official source code + +Apache Airflow is an [Apache Software Foundation](http://www.apache.org) (ASF) project, +and our official source code releases: + +- Follow the [ASF Release Policy](http://www.apache.org/legal/release-policy.html) +- Can be downloaded from [the ASF Distribution Directory](https://downloads.apache.org/airflow) +- Are cryptographically signed by the release manager +- Are officially voted on by the PMC members during the + [Release Approval Process](http://www.apache.org/legal/release-policy.html#release-approval) + +Following the ASF rules, the source packages released must be sufficient for a user to build and test the +release provided they have access to the appropriate platform and tools. + +## Convenience packages + +There are other ways of installing and using Airflow. Those are "convenience" methods - they are +not "official releases" as stated by the `ASF Release Policy`, but they can be used by the users +who do not want to build the software themselves. + +Those are - in the order of most common ways people install Airflow: + +- [PyPI releases](https://pypi.org/project/apache-airflow/) to install Airflow using standard `pip` tool +- [Docker Images](https://hub.docker.com/repository/docker/apache/airflow) to install airflow via + `docker` tool, use them in Kubernetes, Helm Charts, `docker-compose`, `docker swarm` etc. You can + read more about using, customising, and extending the images in the + [Latest docs](https://airflow.apache.org/docs/apache-airflow/stable/production-deployment.html), and + learn details on the internals in the [IMAGES.rst](IMAGES.rst) document. +- [Tags in GitHub](https://github.com/apache/airflow/tags) to retrieve the git project sources that + were used to generate official source packages via git + +All those artifacts are not official releases, but they are prepared using officially released sources. +Some of those artifacts are "development" or "pre-release" ones, and they are clearly marked as such +following the ASF Policy. ## User Interface - **DAGs**: Overview of all DAGs in your environment. -![](/docs/img/dags.png) -- **Tree View**: Tree representation of a DAG that spans across time. -![](/docs/img/tree.png) + ![](/docs/img/dags.png) -- **Graph View**: Visualization of a DAG's dependencies and their current status for a specific run. -![](/docs/img/graph.png) +- **Tree View**: Tree representation of a DAG that spans across time. -- **Task Duration**: Total time spent on different tasks over time. -![](/docs/img/duration.png) + ![](/docs/img/tree.png) -- **Gantt View**: Duration and overlap of a DAG. -![](/docs/img/gantt.png) +- **Graph View**: Visualization of a DAG's dependencies and their current status for a specific run. -- **Code View**: Quick way to view source code of a DAG. -![](/docs/img/code.png) + ![](/docs/img/graph.png) +- **Task Duration**: Total time spent on different tasks over time. -## Using hooks and Operators from "master" in Airflow 1.10 + ![](/docs/img/duration.png) -Currently stable versions of Apache Airflow are released in 1.10.* series. We are working on the -future, major version of Airflow from the 2.0.* series. It is going to be released in -in 2020. However the exact time of release depends on many factors and is yet unknown. -We have already a lot of changes in the hooks/operators/sensors for many external systems -and they are not used because they are part of the master/2.0 release. +- **Gantt View**: Duration and overlap of a DAG. 
-In the Airflow 2.0 - following AIP-21 "change in import paths" all the non-core operators/hooks/sensors -of Apache Airflow have been moved to the "airflow.providers" package. This opened a possibility to -use the operators from Airflow 2.0 in Airflow 1.10 - with the constraint that those -packages can only be used in python3.6+ environment. + ![](/docs/img/gantt.png) -Therefore we decided to prepare and release backport packages that can be installed -for older Airflow versions. Those backport packages are released more frequently. Users do not -have to upgrade their Airflow version to use those packages. There are a number of changes -between Airflow 2.0 and 1.10.* - documented in [UPDATING.md](UPDATING.md). With backported -providers package users can migrate their DAGs to the new providers package incrementally -and once they convert to the new operators/sensors/hooks they can seamlessly migrate their -environments to Airflow 2.0. +- **Code View**: Quick way to view source code of a DAG. -More information about the status and releases of the back-ported packages are available -at [Backported providers package page](https://cwiki.apache.org/confluence/display/AIRFLOW/Backported+providers+packages+for+Airflow+1.10.*+series) + ![](/docs/img/code.png) -Dependencies between packages are stored in ``airflow/providers/dependencies.json``. See -[CONTRIBUTING.rst](https://github.com/apache/airflow/blob/master/CONTRIBUTING.rst#backport-providers-packages) ## Contributing Want to help build Apache Airflow? Check out our [contributing documentation](https://github.com/apache/airflow/blob/master/CONTRIBUTING.rst). +## Who uses Apache Airflow? -## Who uses Airflow? - -As the Apache Airflow community grows, we'd like to keep track of who is using -the platform. Please send a PR with your company name and @githubhandle -if you may. - -Committers: - -* Refer to [Committers](https://cwiki.apache.org/confluence/display/AIRFLOW/Committers) - -Currently **officially** using Airflow: - -1. [AdBOOST](https://www.adboost.sk) [[AdBOOST](https://github.com/AdBOOST)] -1. [Agari](https://github.com/agaridata) [[@r39132](https://github.com/r39132)] -1. [Airbnb](http://airbnb.io/) [[@mistercrunch](https://github.com/mistercrunch), [@artwr](https://github.com/artwr)] -1. [Airtel](https://www.airtel.in/) [[@harishbisht](https://github.com/harishbisht)] -1. [Alan](https://alan.eu) [[@charles-go](https://github.com/charles-go)] -1. [allegro.pl](http://allegro.tech/) [[@kretes](https://github.com/kretes)] -1. [AltX](https://www.getaltx.com/about) [[@pedromduarte](https://github.com/pedromduarte)] -1. [Apigee](https://apigee.com) [[@btallman](https://github.com/btallman)] -1. [ARGO Labs](http://www.argolabs.org) [[California Data Collaborative](https://github.com/California-Data-Collaborative)] -1. [Astronomer](http://www.astronomer.io) [[@schnie](https://github.com/schnie), [@andscoop](https://github.com/andscoop), [@tedmiston](https://github.com/tedmiston), [@benjamingregory](https://github.com/benjamingregory)] -1. [Auth0](https://auth0.com) [[@sicarul](https://github.com/sicarul)] -1. [Away](https://awaytravel.com) [[@trunsky](https://github.com/trunsky)] -1. [Azri Solutions](http://www.azrisolutions.com/) [[@userimack](https://github.com/userimack)] -1. [BalanceHero](http://truebalance.io/) [[@swalloow](https://github.com/swalloow)] -1. [Banco de Formaturas](https://www.bancodeformaturas.com.br) [[@guiligan](https://github.com/guiligan)] -1. 
[BandwidthX](http://www.bandwidthx.com) [[@dineshdsharma](https://github.com/dineshdsharma)] -1. [Bellhops](https://github.com/bellhops) -1. [BelugaDB](https://belugadb.com) [[@fabio-nukui](https://github.com/fabio-nukui) & [@joao-sallaberry](http://github.com/joao-sallaberry) & [@lucianoviola](https://github.com/lucianoviola) & [@tmatuki](https://github.com/tmatuki)] -1. [BlaBlaCar](https://www.blablacar.com) [[@puckel](https://github.com/puckel) & [@wmorin](https://github.com/wmorin)] -1. [Bloc](https://www.bloc.io) [[@dpaola2](https://github.com/dpaola2)] -1. [Blue Yonder](http://www.blue-yonder.com) [[@blue-yonder](https://github.com/blue-yonder)] -1. [BlueApron](https://www.blueapron.com) [[@jasonjho](https://github.com/jasonjho) & [@matthewdavidhauser](https://github.com/matthewdavidhauser)] -1. [Bluecore](https://www.bluecore.com) [[@JLDLaughlin](https://github.com/JLDLaughlin)] -1. [Boda Telecom Suite - CE](https://github.com/bodastage/bts-ce) [[@erssebaggala](https://github.com/erssebaggala), [@bodastage](https://github.com/bodastage)] -1. [Bodastage Solutions](http://bodastage.com) [[@erssebaggala](https://github.com/erssebaggala), [@bodastage](https://github.com/bodastage)] -1. [Bonnier Broadcasting](http://www.bonnierbroadcasting.com) [[@wileeam](https://github.com/wileeam)] -1. [BounceX](http://www.bouncex.com) [[@JoshFerge](https://github.com/JoshFerge), [@hudsonrio](https://github.com/hudsonrio), [@ronniekritou](https://github.com/ronniekritou)] -1. [California Data Collaborative](https://github.com/California-Data-Collaborative) powered by [ARGO Labs](http://www.argolabs.org) -1. [Carbonite](https://www.carbonite.com) [[@ajbosco](https://github.com/ajbosco)] -1. [Celect](http://www.celect.com) [[@superdosh](https://github.com/superdosh) & [@chadcelect](https://github.com/chadcelect)] -1. [Change.org](https://www.change.org) [[@change](https://github.com/change), [@vijaykramesh](https://github.com/vijaykramesh)] -1. [Checkr](https://checkr.com) [[@tongboh](https://github.com/tongboh)] -1. [Children's Hospital of Philadelphia Division of Genomic Diagnostics](http://www.chop.edu/centers-programs/division-genomic-diagnostics) [[@genomics-geek]](https://github.com/genomics-geek/) -1. [Cinimex DataLab](http://cinimex.ru) [[@kdubovikov](https://github.com/kdubovikov)] -1. [City of San Diego](http://sandiego.gov) [[@MrMaksimize](https://github.com/mrmaksimize), [@andrell81](https://github.com/andrell81) & [@arnaudvedy](https://github.com/arnaudvedy)] -1. [Clairvoyant](https://clairvoyantsoft.com) [@shekharv](https://github.com/shekharv) -1. [Clover Health](https://www.cloverhealth.com) [[@gwax](https://github.com/gwax) & [@vansivallab](https://github.com/vansivallab)] -1. [Chartboost](https://www.chartboost.com) [[@cgelman](https://github.com/cgelman) & [@dclubb](https://github.com/dclubb)] -1. [ContaAzul](https://www.contaazul.com) [[@bern4rdelli](https://github.com/bern4rdelli), [@renanleme](https://github.com/renanleme) & [@sabino](https://github.com/sabino)] -1. [Cotap](https://github.com/cotap/) [[@maraca](https://github.com/maraca) & [@richardchew](https://github.com/richardchew)] -1. [Craig@Work](https://www.craigatwork.com) -1. [Credit Karma](https://www.creditkarma.com/) [[@preete-dixit-ck](https://github.com/preete-dixit-ck) & [@harish-gaggar-ck](https://github.com/harish-gaggar-ck) & [@greg-finley-ck](https://github.com/greg-finley-ck)] -1. 
[CreditCards.com](https://www.creditcards.com/)[[@vmAggies](https://github.com/vmAggies) & [@jay-wallaby](https://github.com/jay-wallaby)] -1. [Creditas](https://www.creditas.com.br) [[@dcassiano](https://github.com/dcassiano)] -1. [Custom Ink](https://www.customink.com/) [[@david-dalisay](https://github.com/david-dalisay), [@dmartin11](https://github.com/dmartin11) & [@mpeteuil](https://github.com/mpeteuil)] -1. [Data Reply](https://www.datareply.co.uk/) [[@kaxil](https://github.com/kaxil)] -1. [DataFox](https://www.datafox.com/) [[@sudowork](https://github.com/sudowork)] -1. [Digital First Media](http://www.digitalfirstmedia.com/) [[@duffn](https://github.com/duffn) & [@mschmo](https://github.com/mschmo) & [@seanmuth](https://github.com/seanmuth)] -1. [DocuTAP](https://www.docutap.com/) [[@jshvrsn](https://github.com/jshvrsn) & [@lhvphan](https://github.com/lhvphan) & [@cloneluke](https://github.com/cloneluke)] -1. [Dotmodus](http://dotmodus.com) [[@dannylee12](https://github.com/dannylee12)] -1. [Drivy](https://www.drivy.com) [[@AntoineAugusti](https://github.com/AntoineAugusti)] -1. [Easy Taxi](http://www.easytaxi.com/) [[@caique-lima](https://github.com/caique-lima) & [@WesleyBatista](https://github.com/WesleyBatista) & [@diraol](https://github.com/diraol)] -1. [eRevalue](https://www.datamaran.com) [[@hamedhsn](https://github.com/hamedhsn)] -1. [evo.company](https://evo.company/) [[@orhideous](https://github.com/orhideous)] -1. [FreshBooks](https://github.com/freshbooks) [[@DinoCow](https://github.com/DinoCow)] -1. [Fundera](https://fundera.com) [[@andyxhadji](https://github.com/andyxhadji)] -1. [GameWisp](https://gamewisp.com) [[@tjbiii](https://github.com/TJBIII) & [@theryanwalls](https://github.com/theryanwalls)] -1. [Gentner Lab](http://github.com/gentnerlab) [[@neuromusic](https://github.com/neuromusic)] -1. [Glassdoor](https://github.com/Glassdoor) [[@syvineckruyk](https://github.com/syvineckruyk)] -1. [Global Fashion Group](http://global-fashion-group.com) [[@GFG](https://github.com/GFG)] -1. [GovTech GDS](https://gds-gov.tech) [[@chrissng](https://github.com/chrissng) & [@datagovsg](https://github.com/datagovsg)] -1. [Grand Rounds](https://www.grandrounds.com/) [[@richddr](https://github.com/richddr), [@timz1290](https://github.com/timz1290), [@wenever](https://github.com/@wenever), & [@runongirlrunon](https://github.com/runongirlrunon)] -1. [Groupalia](http://es.groupalia.com) [[@jesusfcr](https://github.com/jesusfcr)] -1. [Groupon](https://groupon.com) [[@stevencasey](https://github.com/stevencasey)] -1. [Gusto](https://gusto.com) [[@frankhsu](https://github.com/frankhsu)] -1. [Handshake](https://joinhandshake.com/) [[@mhickman](https://github.com/mhickman)] -1. [Handy](http://www.handy.com/careers/73115?gh_jid=73115&gh_src=o5qcxn) [[@marcintustin](https://github.com/marcintustin) / [@mtustin-handy](https://github.com/mtustin-handy)] -1. [HBC Digital](http://tech.hbc.com) [[@tmccartan](https://github.com/tmccartan) & [@dmateusp](https://github.com/dmateusp)] -1. [HBO](http://www.hbo.com/)[[@yiwang](https://github.com/yiwang)] -1. [Healthjump](http://www.healthjump.com/) [[@miscbits](https://github.com/miscbits)] -1. [HelloFresh](https://www.hellofresh.com) [[@tammymendt](https://github.com/tammymendt) & [@davidsbatista](https://github.com/davidsbatista) & [@iuriinedostup](https://github.com/iuriinedostup)] -1. [Holimetrix](http://holimetrix.com/) [[@thibault-ketterer](https://github.com/thibault-ketterer)] -1. [Hootsuite](https://github.com/hootsuite) -1. 
[Hostnfly](https://www.hostnfly.com/) [[@CyrilLeMat](https://github.com/CyrilLeMat) & [@pierrechopin](https://github.com/pierrechopin) & [@alexisrosuel](https://github.com/alexisrosuel)] -1. [HotelQuickly](https://github.com/HotelQuickly) [[@zinuzoid](https://github.com/zinuzoid)] -1. [IFTTT](https://www.ifttt.com/) [[@apurvajoshi](https://github.com/apurvajoshi)] -1. [iHeartRadio](http://www.iheart.com/)[[@yiwang](https://github.com/yiwang)] -1. [imgix](https://www.imgix.com/) [[@dclubb](https://github.com/dclubb)] -1. [ING](http://www.ing.com/) -1. [Intercom](http://www.intercom.com/) [[@fox](https://github.com/fox) & [@paulvic](https://github.com/paulvic)] -1. [Investorise](https://investorise.com/) [[@svenvarkel](https://github.com/svenvarkel)] -1. [Jampp](https://github.com/jampp) -1. [JobTeaser](https://www.jobteaser.com) [[@stefani75](https://github.com/stefani75) & [@knil-sama](https://github.com/knil-sama)] -1. [Kalibrr](https://www.kalibrr.com/) [[@charlesverdad](https://github.com/charlesverdad)] -1. [Karmic](https://karmiclabs.com) [[@hyw](https://github.com/hyw)] -1. [Kiwi.com](https://kiwi.com/) [[@underyx](https://github.com/underyx)] -1. [Kogan.com](https://github.com/kogan) [[@geeknam](https://github.com/geeknam)] -1. [Lemann Foundation](http://fundacaolemann.org.br) [[@fernandosjp](https://github.com/fernandosjp)] -1. [LendUp](https://www.lendup.com/) [[@lendup](https://github.com/lendup)] -1. [LetsBonus](http://www.letsbonus.com) [[@jesusfcr](https://github.com/jesusfcr) & [@OpringaoDoTurno](https://github.com/OpringaoDoTurno)] -1. [liligo](http://liligo.com/) [[@tromika](https://github.com/tromika)] -1. [LingoChamp](http://www.liulishuo.com/) [[@haitaoyao](https://github.com/haitaoyao)] -1. [Lucid](http://luc.id) [[@jbrownlucid](https://github.com/jbrownlucid) & [@kkourtchikov](https://github.com/kkourtchikov)] -1. [Lumos Labs](https://www.lumosity.com/) [[@rfroetscher](https://github.com/rfroetscher/) & [@zzztimbo](https://github.com/zzztimbo/)] -1. [Lyft](https://www.lyft.com/)[[@SaurabhBajaj](https://github.com/SaurabhBajaj)] -1. [M4U](https://www.m4u.com.br/) [[@msantino](https://github.com/msantino)] -1. [Madrone](http://madroneco.com/) [[@mbreining](https://github.com/mbreining) & [@scotthb](https://github.com/scotthb)] -1. [Markovian](https://markovian.com/) [[@al-xv](https://github.com/al-xv), [@skogsbaeck](https://github.com/skogsbaeck), [@waltherg](https://github.com/waltherg)] -1. [Mercadoni](https://www.mercadoni.com.co) [[@demorenoc](https://github.com/demorenoc)] -1. [Mercari](http://www.mercari.com/) [[@yu-iskw](https://github.com/yu-iskw)] -1. [MFG Labs](https://github.com/MfgLabs) -1. [MiNODES](https://www.minodes.com) [[@dice89](https://github.com/dice89), [@diazcelsa](https://github.com/diazcelsa)] -1. [Multiply](https://www.multiply.com) [[@nrhvyc](https://github.com/nrhvyc)] -1. [mytaxi](https://mytaxi.com) [[@mytaxi](https://github.com/mytaxi)] -1. [Nerdwallet](https://www.nerdwallet.com) -1. [New Relic](https://www.newrelic.com) [[@marcweil](https://github.com/marcweil)] -1. [Newzoo](https://www.newzoo.com) [[@newzoo-nexus](https://github.com/newzoo-nexus)] -1. [Nextdoor](https://nextdoor.com) [[@SivaPandeti](https://github.com/SivaPandeti), [@zshapiro](https://github.com/zshapiro) & [@jthomas123](https://github.com/jthomas123)] -1. [OdysseyPrime](https://www.goprime.io/) [[@davideberdin](https://github.com/davideberdin)] -1. [OfferUp](https://offerupnow.com) -1. 
[OneFineStay](https://www.onefinestay.com) [[@slangwald](https://github.com/slangwald)] -1. [Open Knowledge International](https://okfn.org) [@vitorbaptista](https://github.com/vitorbaptista) -1. [Overstock](https://www.github.com/overstock) [[@mhousley](https://github.com/mhousley) & [@mct0006](https://github.com/mct0006)] -1. [Pandora Media](https://www.pandora.com/) [[@Acehaidrey](https://github.com/Acehaidrey) & [@wolfier](https://github.com/wolfier)] -1. [PAYMILL](https://www.paymill.com/) [[@paymill](https://github.com/paymill) & [@matthiashuschle](https://github.com/matthiashuschle)] -1. [PayPal](https://www.paypal.com/) [[@r39132](https://github.com/r39132) & [@jhsenjaliya](https://github.com/jhsenjaliya)] -1. [Pernod-Ricard](https://www.pernod-ricard.com/) [[@romain-nio](https://github.com/romain-nio)] -1. [Plaid](https://www.plaid.com/) [[@plaid](https://github.com/plaid), [@AustinBGibbons](https://github.com/AustinBGibbons) & [@jeeyoungk](https://github.com/jeeyoungk)] -1. [Playbuzz](https://www.playbuzz.com/) [[@clintonboys](https://github.com/clintonboys) & [@dbn](https://github.com/dbn)] -1. [PMC](https://pmc.com/) [[@andrewm4894](https://github.com/andrewm4894)] -1. [Postmates](http://www.postmates.com) [[@syeoryn](https://github.com/syeoryn)] -1. [Pronto Tools](http://www.prontotools.io/) [[@zkan](https://github.com/zkan) & [@mesodiar](https://github.com/mesodiar)] -1. [PubNub](https://pubnub.com) [[@jzucker2](https://github.com/jzucker2)] -1. [Qplum](https://qplum.co) [[@manti](https://github.com/manti)] -1. [Quantopian](https://www.quantopian.com/) [[@eronarn](http://github.com/eronarn)] -1. [Qubole](https://qubole.com) [[@msumit](https://github.com/msumit)] -1. [Quizlet](https://quizlet.com) [[@quizlet](https://github.com/quizlet)] -1. [Quora](https://www.quora.com/) -1. [REA Group](https://www.rea-group.com/) -1. [Reddit](https://www.reddit.com/) [[@reddit](https://github.com/reddit/)] -1. [Robinhood](https://robinhood.com) [[@vineet-rh](https://github.com/vineet-rh)] -1. [Scaleway](https://scaleway.com) [[@kdeldycke](https://github.com/kdeldycke)] -1. [Sense360](https://github.com/Sense360) [[@kamilmroczek](https://github.com/KamilMroczek)] -1. [Shopkick](https://shopkick.com/) [[@shopkick](https://github.com/shopkick)] -1. [Sidecar](https://hello.getsidecar.com/) [[@getsidecar](https://github.com/getsidecar)] -1. [SimilarWeb](https://www.similarweb.com/) [[@similarweb](https://github.com/similarweb)] -1. [SmartNews](https://www.smartnews.com/) [[@takus](https://github.com/takus)] -1. [SocialCops](https://www.socialcops.com/) [[@vinayak-mehta](https://github.com/vinayak-mehta) & [@sharky93](https://github.com/sharky93)] -1. [Spotahome](https://www.spotahome.com/) [[@spotahome](https://github.com/spotahome)] -1. [Spotify](https://github.com/spotify) [[@znichols](https://github.com/znichols)] -1. [Stackspace](https://beta.stackspace.io/) -1. [Stripe](https://stripe.com) [[@jbalogh](https://github.com/jbalogh)] -1. [Tails.com](https://tails.com/) [[@alanmcruickshank](https://github.com/alanmcruickshank)] -1. [Thinking Machines](https://thinkingmachin.es) [[@marksteve](https://github.com/marksteve)] -1. [Thinknear](https://www.thinknear.com/) [[@d3cay1](https://github.com/d3cay1), [@ccson](https://github.com/ccson), & [@ababian](https://github.com/ababian)] -1. [Thumbtack](https://www.thumbtack.com/) [[@natekupp](https://github.com/natekupp)] -1. [Tictail](https://tictail.com/) -1. [Tile](https://tile.com/) [[@ranjanmanish](https://github.com/ranjanmanish)] -1. 
[Tokopedia](https://www.tokopedia.com/) [@topedmaria](https://github.com/topedmaria) -1. [Twine Labs](https://www.twinelabs.com/) [[@ivorpeles](https://github.com/ivorpeles)] -1. [Twitter](https://www.twitter.com/) [[@aoen](https://github.com/aoen)] -1. [T2 Systems](http://t2systems.com) [[@unclaimedpants](https://github.com/unclaimedpants)] -1. [Ubisoft](https://www.ubisoft.com/) [[@Walkoss](https://github.com/Walkoss)] -1. [United Airlines](https://www.united.com/) [[@ilopezfr](https://github.com/ilopezfr)] -1. [Upsight](https://www.upsight.com) [[@dhuang](https://github.com/dhuang)] -1. [Vente-Exclusive.com](http://www.vente-exclusive.com/) [[@alexvanboxel](https://github.com/alexvanboxel)] -1. [Vevo](https://www.vevo.com/) [[@csetiawan](https://github.com/csetiawan) & [@jerrygillespie](https://github.com/jerrygillespie)] -1. [Vnomics](https://github.com/vnomics) [[@lpalum](https://github.com/lpalum)] -1. [WePay](http://www.wepay.com) [[@criccomini](https://github.com/criccomini) & [@mtagle](https://github.com/mtagle)] -1. [WeTransfer](https://github.com/WeTransfer) [[@jochem](https://github.com/jochem)] -1. [Whistle Labs](http://www.whistle.com) [[@ananya77041](https://github.com/ananya77041)] -1. [WiseBanyan](https://wisebanyan.com/) -1. [Wooga](https://www.wooga.com/) -1. [Xero](https://www.xero.com/) [[@yan9yu](https://github.com/yan9yu)] -1. [Xoom](https://www.xoom.com/) -1. [Yahoo!](https://www.yahoo.com/) -1. [Yieldr](https://www.yieldr.com/) [[@ggeorgiadis](https://github.com/ggeorgiadis)] -1. [Zapier](https://www.zapier.com) [[@drknexus](https://github.com/drknexus) & [@statwonk](https://github.com/statwonk)] -1. [Zego](https://www.zego.com/) [[@ruimffl](https://github.com/ruimffl)] -1. [Zendesk](https://www.github.com/zendesk) -1. [Zenly](https://zen.ly) [[@cerisier](https://github.com/cerisier) & [@jbdalido](https://github.com/jbdalido)] -1. [Zymergen](https://www.zymergen.com/) -1. [99](https://99taxis.com) [[@fbenevides](https://github.com/fbenevides), [@gustavoamigo](https://github.com/gustavoamigo) & [@mmmaia](https://github.com/mmmaia)] +More than 350 organizations are using Apache Airflow [in the wild](https://github.com/apache/airflow/blob/master/INTHEWILD.md). ## Who Maintains Apache Airflow? @@ -394,15 +272,18 @@ Airflow is the work of the [community](https://github.com/apache/airflow/graphs/ but the [core committers/maintainers](https://people.apache.org/committers-by-project.html#airflow) are responsible for reviewing and merging PRs as well as steering conversation around new feature requests. If you would like to become a maintainer, please review the Apache Airflow -[committer requirements](https://cwiki.apache.org/confluence/display/AIRFLOW/Committers). +[committer requirements](https://airflow.apache.org/docs/stable/project.html#committers). ## Can I use the Apache Airflow logo in my presentation? Yes! Be sure to abide by the Apache Foundation [trademark policies](https://www.apache.org/foundation/marks/#books) and the Apache Airflow [Brandbook](https://cwiki.apache.org/confluence/display/AIRFLOW/Brandbook). The most up to date logos are found in [this repo](/docs/img/logos) and on the Apache Software Foundation [website](https://www.apache.org/logos/about.html). -## Links +## Airflow merchandise + +If you would love to have Apache Airflow stickers, t-shirt etc. then check out +[Redbubble Shop](https://www.redbubble.com/i/sticker/Apache-Airflow-by-comdev/40497530.EJUG5). 
+## Links -- [Documentation](https://airflow.apache.org/) -- [Chat](https://apache-airflow-slack.herokuapp.com/) -- [More](https://cwiki.apache.org/confluence/display/AIRFLOW/Airflow+Links) +- [Documentation](https://airflow.apache.org/docs/stable/) +- [Chat](https://s.apache.org/airflow-slack) diff --git a/STATIC_CODE_CHECKS.rst b/STATIC_CODE_CHECKS.rst index ab6cc91cf645d..7f0229905205f 100644 --- a/STATIC_CODE_CHECKS.rst +++ b/STATIC_CODE_CHECKS.rst @@ -46,54 +46,116 @@ require Breeze Docker images to be installed locally: =================================== ================================================================ ============ **Hooks** **Description** **Breeze** =================================== ================================================================ ============ +``airflow-config-yaml`` Checks that airflow config yaml is 1-1 with the code +----------------------------------- ---------------------------------------------------------------- ------------ ``base-operator`` Checks that BaseOperator is imported properly ----------------------------------- ---------------------------------------------------------------- ------------ -``build`` Builds image for check-apache-licence, mypy, flake8. * +``bats-tests`` Runs BATS bash unit tests +----------------------------------- ---------------------------------------------------------------- ------------ +``build`` Builds image for mypy, flake8. * +----------------------------------- ---------------------------------------------------------------- ------------ +``bats-in-container-tests`` Run in Breeze container bats tests * +----------------------------------- ---------------------------------------------------------------- ------------ +``black`` Runs Black (the uncompromising Python code formatter) +----------------------------------- ---------------------------------------------------------------- ------------ +``build`` Builds image for mypy, pylint, flake8. * ----------------------------------- ---------------------------------------------------------------- ------------ -``check-apache-license`` Checks compatibility with Apache License requirements. * +``build-providers-dependencies`` Regenerates the json file with cross-provider dependencies +----------------------------------- ---------------------------------------------------------------- ------------ +``check-apache-license`` Checks compatibility with Apache License requirements. +----------------------------------- ---------------------------------------------------------------- ------------ +``check-builtin-literals`` Require literal syntax when initializing Python builtin types ----------------------------------- ---------------------------------------------------------------- ------------ ``check-executables-have-shebangs`` Checks that executables have shebang. ----------------------------------- ---------------------------------------------------------------- ------------ ``check-hooks-apply`` Checks which hooks are applicable to the repository. ----------------------------------- ---------------------------------------------------------------- ------------ -``check-merge-conflict`` Checks if a merge conflict is committed. +``check-hooks-apply`` Checks which hooks are applicable to the repository. +----------------------------------- ---------------------------------------------------------------- ------------ +``check-integrations`` Checks if integration list is synchronized in code. 
+----------------------------------- ---------------------------------------------------------------- ------------ +``check-merge-conflicts`` Checks that merge conflicts are not being committed. ----------------------------------- ---------------------------------------------------------------- ------------ ``check-xml`` Checks XML files with xmllint. ----------------------------------- ---------------------------------------------------------------- ------------ -``debug-statements`` Detects accidenatally committed debug statements. +``debug-statements`` Detects accidentally committed debug statements. ----------------------------------- ---------------------------------------------------------------- ------------ ``detect-private-key`` Detects if private key is added to the repository. ----------------------------------- ---------------------------------------------------------------- ------------ ``doctoc`` Refreshes the table of contents for md files. ----------------------------------- ---------------------------------------------------------------- ------------ +``dont-use-safe-filter`` Don't use safe in templates. +----------------------------------- ---------------------------------------------------------------- ------------ ``end-of-file-fixer`` Makes sure that there is an empty line at the end. ----------------------------------- ---------------------------------------------------------------- ------------ +``fix-encoding-pragma`` Removes encoding header from python files. +----------------------------------- ---------------------------------------------------------------- ------------ ``flake8`` Runs flake8. * ----------------------------------- ---------------------------------------------------------------- ------------ ``forbid-tabs`` Fails if tabs are used in the project. ----------------------------------- ---------------------------------------------------------------- ------------ +``helm-lint`` Verifies if helm lint passes for the chart +----------------------------------- ---------------------------------------------------------------- ------------ +``identity`` Prints inputs to the static check hooks for troubleshooting +----------------------------------- ---------------------------------------------------------------- ------------ +``incorrect-use-of-LoggingMixin`` Checks if LoggingMixin is properly imported. +----------------------------------- ---------------------------------------------------------------- ------------ ``insert-license`` Adds licenses for most file types. ----------------------------------- ---------------------------------------------------------------- ------------ ``isort`` Sorts imports in python files. ----------------------------------- ---------------------------------------------------------------- ------------ +``language-matters`` Check for language that we do not accept as community +----------------------------------- ---------------------------------------------------------------- ------------ ``lint-dockerfile`` Lints a dockerfile. ----------------------------------- ---------------------------------------------------------------- ------------ +``lint-openapi`` Lints openapi specification. +----------------------------------- ---------------------------------------------------------------- ------------ +``markdownlint`` Lints Markdown files. +----------------------------------- ---------------------------------------------------------------- ------------ +``mermaid`` Generates diagrams from mermaid files. 
+----------------------------------- ---------------------------------------------------------------- ------------ ``mixed-line-ending`` Detects if mixed line ending is used (\r vs. \r\n). ----------------------------------- ---------------------------------------------------------------- ------------ ``mypy`` Runs mypy. * ----------------------------------- ---------------------------------------------------------------- ------------ -``pydevd`` Check for accidentally commited pydevd statements. +``mypy-helm`` Runs mypy. * +----------------------------------- ---------------------------------------------------------------- ------------ +``pre-commit-descriptions`` Check if all pre-commits are described in docs. +----------------------------------- ---------------------------------------------------------------- ------------ +``pydevd`` Check for accidentally committed pydevd statements. +----------------------------------- ---------------------------------------------------------------- ------------ +``pydocstyle`` Runs pydocstyle. +----------------------------------- ---------------------------------------------------------------- ------------ +``python2-compile`` Check if python files compile with Python 2. +----------------------------------- ---------------------------------------------------------------- ------------ +``python2-fastcheck`` Fast grep check for common Python 3 cherry-picking problems. ----------------------------------- ---------------------------------------------------------------- ------------ ``python-no-log-warn`` Checks if there are no deprecate log warn. ----------------------------------- ---------------------------------------------------------------- ------------ +``restrict-start_date`` 'start_date' should not be in default_args in example_dags +----------------------------------- ---------------------------------------------------------------- ------------ ``rst-backticks`` Checks if RST files use double backticks for code. ----------------------------------- ---------------------------------------------------------------- ------------ ``setup-order`` Checks for an order of dependencies in setup.py ----------------------------------- ---------------------------------------------------------------- ------------ +``setup-installation`` Checks if all the libraries in setup.py are listed in docs +----------------------------------- ---------------------------------------------------------------- ------------ ``shellcheck`` Checks shell files with shellcheck. ----------------------------------- ---------------------------------------------------------------- ------------ +``sort-in-the-wild`` Sort INTHEWILD.md alphabetically. +----------------------------------- ---------------------------------------------------------------- ------------ +``trailing-whitespace`` Removes trailing whitespace at end of line. +----------------------------------- ---------------------------------------------------------------- ------------ ``update-breeze-file`` Update output of breeze command in BREEZE.rst. ----------------------------------- ---------------------------------------------------------------- ------------ +``update-extras`` Updates extras in the documentation. +----------------------------------- ---------------------------------------------------------------- ------------ +``update-local-yml-file`` Updates mounts in local.yml file. 
+----------------------------------- ---------------------------------------------------------------- ------------ +``update-setup-cfg-file`` Update setup.cfg file with all licenses. +----------------------------------- ---------------------------------------------------------------- ------------ +``update-extras`` Updates extras in the documentation. +----------------------------------- ---------------------------------------------------------------- ------------ ``yamllint`` Checks yaml files with yamllint. =================================== ================================================================ ============ @@ -207,9 +269,12 @@ Running Static Code Checks via Breeze The static code checks can be launched using the Breeze environment. -You run the static code checks via ``./breeze static-check`` or ``./breeze static-check-all-files`` commands. -The former ones run appropriate checks only for files changed and staged locally, the latter ones -run checks on all files. +You run the static code checks via ``./breeze static-check`` or commands. + +Note that it may take a lot of time to run checks for all files with pylint on macOS due to a slow +filesystem for macOS Docker. As a workaround, you can add their arguments after ``--`` as extra arguments. +For example ``--files`` flag. By default those checks are run only on the files you've changed in your +commit, but you can also add ``-- --all-files`` flag to run check on all files. You can see the list of available static checks either via ``--help`` flag or by using the autocomplete option. Note that the ``all`` static check runs all configured static checks. @@ -224,7 +289,7 @@ Run the ``mypy`` check for all files: .. code-block:: bash - ./breeze static-check-all-files mypy + ./breeze static-check mypy -- --all-files Run the ``flake8`` check for the ``tests.core.py`` file with verbose output: @@ -248,7 +313,7 @@ Run all tests for all files: .. code-block:: bash - ./breeze static-check-all-files all + ./breeze static-check all -- --all-files The ``license`` check is run via the same Docker image containing the @@ -257,19 +322,19 @@ It does not take pre-commit parameters as extra arguments. .. code-block:: bash - ./breeze static-check-all-files licenses + ./breeze static-check licenses Running Static Code Checks via Scripts from the Host .................................................... You can trigger the static checks from the host environment, without entering the Docker container. To do -this, run the following scripts (the same is done in Travis CI): +this, run the following scripts: -* ``_ - checks the licenses. -* ``_ - checks that documentation can be built without warnings. -* ``_ - runs Flake8 source code style enforcement tool. -* ``_ - runs lint checker for the dockerfiles. -* ``_ - runs a check for mypy type annotation consistency. +* ``_ - checks that documentation can be built without warnings. +* ``_ - checks the licenses. +* ``_ - runs Flake8 source code style enforcement tool. +* ``_ - runs lint checker for the dockerfiles. +* ``_ - runs a check for mypy type annotation consistency. The scripts may ask you to rebuild the images, if needed. 
@@ -285,10 +350,10 @@ Running Static Code Checks in the Docker Container If you are already in the Breeze Docker environment (by running the ``./breeze`` command), you can also run the same static checks via run_scripts: -* Mypy: ``./scripts/ci/in_container/run_mypy.sh airflow tests`` -* Flake8: ``./scripts/ci/in_container/run_flake8.sh`` -* License check: ``./scripts/ci/in_container/run_check_licence.sh`` -* Documentation: ``./scripts/ci/in_container/run_docs_build.sh`` +* Mypy: ``./scripts/in_container/run_mypy.sh airflow tests`` +* Flake8: ``./scripts/in_container/run_flake8.sh`` +* License check: ``./scripts/in_container/run_check_licence.sh`` +* Documentation: ``./scripts/in_container/run_docs_build.sh`` Running Static Code Checks for Selected Files ............................................. @@ -300,20 +365,20 @@ In the Docker container: .. code-block:: - ./scripts/ci/in_container/run_mypy.sh ./airflow/example_dags/ + ./scripts/in_container/run_mypy.sh ./airflow/example_dags/ or .. code-block:: - ./scripts/ci/in_container/run_mypy.sh ./airflow/example_dags/test_utils.py + ./scripts/in_container/run_mypy.sh ./airflow/example_dags/test_utils.py On the host: .. code-block:: - ./scripts/ci/ci_mypy.sh ./airflow/example_dags/ + ./scripts/ci/static_checks/mypy.sh ./airflow/example_dags/ .. code-block:: - ./scripts/ci/ci_mypy.sh ./airflow/example_dags/test_utils.py + ./scripts/ci/static_checks/mypy.sh ./airflow/example_dags/test_utils.py diff --git a/TESTING.rst b/TESTING.rst index 129e6c5aa20c5..ae7fd2385ff6c 100644 --- a/TESTING.rst +++ b/TESTING.rst @@ -25,13 +25,13 @@ Airflow Test Infrastructure and local virtualenv. * **Integration tests** are available in the Breeze development environment - that is also used for Airflow Travis CI tests. Integration tests are special tests that require + that is also used for Airflow CI tests. Integration tests are special tests that require additional services running, such as Postgres, MySQL, Kerberos, etc. Currently, these tests are not - marked as integration tests but soon they will be clearly separated by ``pytest`` annotations. + marked as integration tests but soon they will be separated by ``pytest`` annotations. * **System tests** are automatic tests that use external systems like Google Cloud Platform. These tests are intended for an end-to-end DAG execution. - The tests can be executed on both current version of Apache Airflow, and any of the older + The tests can be executed on both the current version of Apache Airflow and any of the older versions from 1.10.* series. This document is about running Python tests. Before the tests are run, use @@ -111,45 +111,136 @@ This can also be done by specifying a full path to the test: .. code-block:: bash - pytest tests/test_core.py::TestCore::test_check_operators + pytest tests/core/test_core.py::TestCore::test_check_operators To run the whole test class, enter: .. code-block:: bash - pytest tests/test_core.py::TestCore + pytest tests/core/test_core.py::TestCore You can use all available ``pytest`` flags. For example, to increase a log level for debugging purposes, enter: .. code-block:: bash - pytest --log-level=DEBUG tests/test_core.py::TestCore + pytest --log-level=DEBUG tests/core/test_core.py::TestCore Running Tests for a Specified Target Using Breeze from the Host --------------------------------------------------------------- If you wish to only run tests and not to drop into shell, apply the -``-t``, ``--test-target`` flag. You can add extra pytest flags after ``--`` in the command line. 
+``tests`` command. You can add extra test targets, and extra pytest flags after the ``--`` separator. Note that
+you often want to run the tests against a clean/reset DB, so usually you want to add the ``--db-reset`` flag
+to Breeze.
 
 .. code-block:: bash
 
-     ./breeze test-target tests/hooks/test_druid_hook.py -- --logging-level=DEBUG
+     ./breeze tests tests/hooks/test_druid_hook.py tests/core/test_core.py --db-reset -- --logging-level=DEBUG
 
-You can run the whole test suite with a special '.' test target:
+You can run the whole test suite without adding the test target:
 
 .. code-block:: bash
 
-    ./breeze test-target .
+    ./breeze tests --db-reset
 
 You can also specify individual tests or a group of tests:
 
 .. code-block:: bash
 
-    ./breeze test-target tests/test_core.py::TestCore
+    ./breeze tests --db-reset tests/core/test_core.py::TestCore
 
+Running Tests of a Specified Type from the Host
+-----------------------------------------------
+
+You can also run tests for a specific test type. From the stability and performance point of view,
+we separated the tests into different test types so that they can be run separately.
+
+You can select the test type by adding ``--test-type TEST_TYPE`` before the test command. There are two
+kinds of test types:
+
+* Per-directory types select a subset of the tests based on sub-directories of the ``tests`` folder.
+  Example test types here are Core, Providers and CLI. The only action that happens when you choose
+  such a type is that the right test folders are pre-selected. For those types of tests it is only
+  useful to choose the test type when you do not specify which tests to run.
+
+Run all core tests:
+
+.. code-block:: bash
+
+     ./breeze --test-type Core --db-reset tests
+
+Run all provider tests:
+
+.. code-block:: bash
+
+     ./breeze --test-type Providers --db-reset tests
+
+* Special kinds of tests - Integration, Heisentests, Quarantined, Postgres and MySQL - are marked with
+  pytest marks, and for those you need to select the type using the ``--test-type`` switch. If you want
+  to run such tests using Breeze, you need to pass the appropriate ``--test-type``, otherwise the tests
+  will be skipped. Similarly to the per-directory tests, if you do not specify the test or tests to run,
+  all tests of a given type are run.
+
+Run the quarantined test_task_command.py test:
+
+.. code-block:: bash
+
+     ./breeze --test-type Quarantined tests tests/cli/commands/test_task_command.py --db-reset
+
+Run all Quarantined tests:
+
+.. code-block:: bash
+
+     ./breeze --test-type Quarantined tests --db-reset
+
+Helm Unit Tests
+===============
+
+On the Airflow Project, we have decided to stick with Pythonic testing for our Helm chart. This makes our chart
+easier to test, easier to modify, and able to run with the same testing infrastructure. To add Helm unit tests,
+go to the ``chart/tests`` directory and add your unit test by creating a class that extends ``unittest.TestCase``:
+
+.. code-block:: python
+
+  class TestBaseChartTest(unittest.TestCase):
+
+To render the chart, create a YAML string with the nested dictionary of options you wish to test. You can then
+use our ``render_chart`` function to render the object of interest into a testable Python dictionary. Once the chart
+has been rendered, you can use the ``render_k8s_object`` function to create a k8s model object that simultaneously
+ensures that the object created properly conforms to the expected object spec and allows you to use object values
+instead of nested dictionaries.
+
+Here is an example test:
+
+.. 
code-block:: python + + from .helm_template_generator import render_chart, render_k8s_object + + git_sync_basic = """ + dags: + gitSync: + enabled: true + """ + + + class TestGitSyncScheduler(unittest.TestCase): + + def test_basic(self): + helm_settings = yaml.safe_load(git_sync_basic) + res = render_chart('GIT-SYNC', helm_settings, + show_only=["templates/scheduler/scheduler-deployment.yaml"]) + dep: k8s.V1Deployment = render_k8s_object(res[0], k8s.V1Deployment) + self.assertEqual("dags", dep.spec.template.spec.volumes[1].name) + +To run tests using breeze run the following command + +.. code-block:: bash + + ./breeze --test-type Helm tests + Airflow Integration Tests ========================= @@ -161,7 +252,7 @@ Enabling Integrations --------------------- Airflow integration tests cannot be run in the local virtualenv. They can only run in the Breeze -environment with enabled integrations and in Travis CI. +environment with enabled integrations and in the CI. See ``_ for details about Airflow CI. When you are in the Breeze environment, by default all integrations are disabled. This enables only true unit tests to be executed in Breeze. You can enable the integration by passing the ``--integration `` @@ -245,30 +336,23 @@ require more than one integration. If such a marked test does not have a required integration enabled, it is skipped. The skip message clearly says what is needed to use the test. -To run all tests with a certain integration, use the custom pytest flag ``--integrations``, -where you can pass integrations as comma-separated values. You can also specify ``all`` to start -tests for all integrations. +To run all tests with a certain integration, use the custom pytest flag ``--integration``. +You can pass several integration flags if you want to enable several integrations at once. -**NOTE:** If an integration is not enabled in Breeze or Travis CI, +**NOTE:** If an integration is not enabled in Breeze or CI, the affected test will be skipped. To run only ``mongo`` integration tests: .. code-block:: bash - pytest --integrations mongo + pytest --integration mongo -To run integration tests fot ``mongo`` and ``rabbitmq``: +To run integration tests for ``mongo`` and ``rabbitmq``: .. code-block:: bash - pytest --integrations mongo,rabbitmq - -To runs all integration tests: - -.. code-block:: bash - - pytest --integrations all + pytest --integration mongo --integration rabbitmq Note that collecting all tests takes some time. So, if you know where your tests are located, you can speed up the test collection significantly by providing the folder where the tests are located. @@ -277,14 +361,14 @@ Here is an example of the collection limited to the ``providers/apache`` directo .. code-block:: bash - pytest --integrations cassandra tests/providers/apache/ + pytest --integration cassandra tests/providers/apache/ Running Backend-Specific Tests ------------------------------ Tests that are using a specific backend are marked with a custom pytest marker ``pytest.mark.backend``. The marker has a single parameter - the name of a backend. It corresponds to the ``--backend`` switch of -the Breeze environment (one of ``mysql``, ``sqlite``, or ``postgres``). Backen-specific tests only run when +the Breeze environment (one of ``mysql``, ``sqlite``, or ``postgres``). Backend-specific tests only run when the Breeze environment is running with the right backend. If you specify more than one backend in the marker, the test runs for all specified backends. 
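+
+For illustration, here is a minimal sketch of such a marked test. Only the ``pytest.mark.backend``
+marker usage comes from the description above; the test name and body are hypothetical:
+
+.. code-block:: python
+
+    import pytest
+
+
+    @pytest.mark.backend("mysql", "postgres")
+    def test_something_that_needs_a_real_database():
+        # Runs only when the Breeze environment uses the mysql or postgres backend;
+        # with the sqlite backend this test is skipped.
+        assert True
+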
@@ -313,164 +397,374 @@ Here is an example of running only postgres-specific backend tests:
 
     pytest --backend postgres
 
+Running Long-running tests
+--------------------------
+
+Some of the tests run for a long time. Such tests are marked with the ``@pytest.mark.long_running`` annotation.
+Those tests are skipped by default. You can enable them with the ``--include-long-running`` flag. You
+can also decide to run only those tests by passing the ``-m long_running`` flag to pytest.
+
+Quarantined tests
+-----------------
+
+Some of our tests are quarantined. This means that these tests are run in isolation and that they are
+re-run several times. Also, when quarantined tests fail, the whole test suite does not fail. The quarantined
+tests are usually flaky tests that need some attention and fixing.
+
+Those tests are marked with the ``@pytest.mark.quarantined`` annotation.
+Those tests are skipped by default. You can enable them with the ``--include-quarantined`` flag. You
+can also decide to run only those tests by passing the ``-m quarantined`` flag to pytest.
+
+Heisen tests
+------------
+
+Some of our tests are Heisentests. This means that they run fine in isolation, but when they run together with
+other tests they might fail (this is likely due to resource consumption). Therefore, we run those tests
+in isolation.
+
+Those tests are marked with the ``@pytest.mark.heisentests`` annotation.
+Those tests are skipped by default. You can enable them with the ``--include-heisentests`` flag. You
+can also decide to run only those tests by passing the ``-m heisentests`` flag to pytest.
+
+
 Running Tests with Kubernetes
------------------------------
+=============================
 
-Starting Kubernetes Cluster when Starting Breeze
-................................................
+Airflow has tests that are run against a real Kubernetes cluster. We are using
+`Kind `_ to create and run the cluster. We integrated the tools to start/stop/
+deploy and run the cluster tests into our repository and the Breeze development environment.
 
-To run Kubernetes in Breeze, you can start Breeze with the ``--kind-cluster-start`` switch. This
-automatically creates a Kind Kubernetes cluster in the same ``docker`` engine that is used to run Breeze.
-Setting up the Kubernetes cluster takes some time so the cluster continues running
-until the it is stopped with the ``--kind-cluster-stop`` switch or until the ``--kind-cluster-recreate``
-switch is used rather than ``--kind-cluster-start``. Starting Breeze with the Kind Cluster automatically
-sets ``runtime`` to ``kubernetes`` (see below).
+Configuration for the cluster is kept in the ``.build/.kube/config`` file in your Airflow source repository, and
+our scripts set the ``KUBECONFIG`` variable to it. If you want to interact with the Kind cluster created,
+you can do it from outside of the scripts by exporting this variable and pointing it to this file.
 
-The cluster name follows the pattern ``airflow-python-X.Y.Z-vA.B.C`` where X.Y.Z is a Python version
+Starting Kubernetes Cluster
+---------------------------
+
+For your testing, you manage the Kind cluster with the ``kind-cluster`` breeze command:
+
+.. code-block:: bash
+
+    ./breeze kind-cluster [ start | stop | recreate | status | deploy | test | shell | k9s ]
+
+The command allows you to start/stop/recreate/check the status of the Kind Kubernetes cluster, deploy Airflow via the Helm
+chart, as well as interact with the cluster (via the test and shell commands).
+
+Setting up the Kind Kubernetes cluster takes some time, so once you have started it, the cluster continues running
+until it is stopped with the ``kind-cluster stop`` command or until the ``kind-cluster recreate``
+command is used (it will stop and recreate the cluster image).
+
+The cluster name follows the pattern ``airflow-python-X.Y-vA.B.C`` where X.Y is a Python version
 and A.B.C is a Kubernetes version. This way you can have multiple clusters set up and running at the same
 time for different Python versions and different Kubernetes versions.
 
-The Control Plane is available from inside the Docker image via ``-control-plane:6443``
-host:port, the worker of the Kind Cluster is available at
-worker
-and webserver port for the worker is 30809.
-After the Kubernetes Cluster is started, you need to deploy Airflow to the cluster:
+Deploying Airflow to Kubernetes Cluster
+---------------------------------------
 
-1. Build the image.
-2. Load it to the Kubernetes cluster.
-3. Deploy the Airflow application.
+Deploying Airflow to the created Kubernetes cluster is also done via the ``kind-cluster deploy`` breeze command:
 
-It can be done with a single script: ``./scripts/ci/in_container/kubernetes/deploy_airflow_to_kubernetes.sh``.
+.. code-block:: bash
 
-You can, however, work separately on the image in Kubernetes and deploying the Airflow app in the cluster.
+    ./breeze kind-cluster deploy
 
-Building and Loading Airflow Images to Kubernetes Cluster
-..............................................................
+The deploy command performs these steps:
 
-Use the script ``./scripts/ci/in_container/kubernetes/docker/rebuild_airflow_image.sh`` that does the following:
+1. It rebuilds the latest ``apache/airflow:master-pythonX.Y`` production images from the
+   latest sources, using the local cache. It also adds example DAGs to the image, so that they do not
+   have to be mounted inside.
+2. Loads the image to the Kind Cluster using the ``kind load`` command.
+3. Starts Airflow in the cluster using the official Helm chart (in the ``airflow`` namespace).
+4. Forwards the local 8080 port to the webserver running in the cluster.
+5. Applies volumes.yaml to get the volumes deployed to the ``default`` namespace - this is where
+   the KubernetesExecutor starts its pods.
 
-1. Rebuilds the latest ``apache/airflow:master-pythonX.Y-ci`` images using the latest sources.
-2. Builds a new Kubernetes image based on the ``apache/airflow:master-pythonX.Y-ci`` using
-   necessary scripts added to run in Kubernetes. The image is tagged as
-   ``apache/airflow:master-pythonX.Y-ci-kubernetes``.
-3. Loads the image to the Kind Cluster using the ``kind load`` command.
+Running tests with Kubernetes Cluster
+-------------------------------------
 
-Deploying the Airflow Application in the Kubernetes Cluster
-...........................................................
+You can either run all tests or you can select which tests to run. You can also enter an interactive virtualenv
+to run the tests manually one by one.
 
-Use the script ``./scripts/ci/in_container/kubernetes/app/deploy_app.sh`` that does the following:
+Running kubernetes tests via shell:
 
-1. Prepares Kubernetes resources by processing a template from the ``template`` directory and replacing
-   variables with the right images and locations:
-   - configmaps.yaml
-   - airflow.yaml
-2. Uses the existing resources without replacing any variables inside:
-   - secrets.yaml
-   - postgres.yaml
-   - volumes.yaml
-3. Applies all the resources to the Kind Cluster.
-4. 
Waits for all the applications to be ready and reachable. +.. code-block:: bash -After the deployment is finished, you can run Kubernetes tests immediately in the same way as other tests. -The Kubernetes tests are available in the ``tests/runtime/kubernetes`` folder. + ./scripts/ci/kubernetes/ci_run_kubernetes_tests.sh - runs all kubernetes tests + ./scripts/ci/kubernetes/ci_run_kubernetes_tests.sh TEST [TEST ...] - runs selected kubernetes tests (from kubernetes_tests folder) -You can run all the integration tests for Kubernetes with ``pytest tests/runtime/kubernetes``. +Running kubernetes tests via breeze: -Running Runtime-Specific Tests ------------------------------- +.. code-block:: bash -Tests using a specific runtime are marked with a custom pytest marker ``pytest.mark.runtime``. -The marker has a single parameter - the name of a runtime. At the moment the only supported runtime is -``kubernetes``. This runtime is set when you run Breeze with one of the ``--kind-cluster-*`` flags. -Runtime-specific tests run only when the selectd runtime is started. + ./breeze kind-cluster test + ./breeze kind-cluster test -- TEST TEST [TEST ...] -.. code-block:: python +Entering shell with Kubernetes Cluster +-------------------------------------- + +This shell is prepared to run kubernetes tests interactively. It has ``kubectl`` and ``kind`` cli tools +available in the path, it has also activated virtualenv environment that allows you to run tests via pytest. + +You can enter the shell via those scripts + + ./scripts/ci/kubernetes/ci_run_kubernetes_tests.sh [-i|--interactive] - Activates virtual environment ready to run tests and drops you in + ./scripts/ci/kubernetes/ci_run_kubernetes_tests.sh [--help] - Prints this help message + + +.. code-block:: bash + + ./breeze kind-cluster shell - @pytest.mark.runtime("kubernetes") - class TestKubernetesExecutor(unittest.TestCase): +K9s CLI - debug kubernetes in style! +------------------------------------ -You can use the custom ``--runtime`` switch in pytest to only run tests specific for that backend. +Breeze has built-in integration with fantastic k9s CLI tool, that allows you to debug the kubernetes +installation effortlessly and in style. K9S provides terminal (but windowed) CLI that allows you to +easily observe what's going on in the kubernetes instance, observe the resources defined (pods, secrets, +custom resource definitions), enter shell for the Pods/Containers running, see the log files and more. -To run only kubernetes-runtime backend tests, enter: +You can read more about k9s at `https://k9scli.io/ `_ + +Here is the screenshot of k9s tools in operation: + +.. image:: images/testing/k9s.png + :align: center + :alt: K9S tool + + +You can enter the k9s tool via breeze (after you deployed Airflow): .. code-block:: bash - pytest --runtime kubernetes + ./breeze kind-cluster k9s + +You can exit k9s by pressing Ctrl-C. -**NOTE:** For convenience and faster search, all runtime tests are stored in the ``tests.runtime`` package. In this case, you -can speed up the collection of tests by running: +Typical testing pattern for Kubernetes tests +-------------------------------------------- + +The typical session for tests with Kubernetes looks like follows: + +1. Start the Kind cluster: .. code-block:: bash - pytest --runtime kubernetes tests/runtime + ./breeze kind-cluster start -Travis CI Testing Framework -=========================== + Starts Kind Kubernetes cluster + + Use CI image. 
+ + Branch name: master + Docker image: apache/airflow:master-python3.7-ci + + Airflow source version: 2.0.0.dev0 + Python version: 3.7 + DockerHub user: apache + DockerHub repo: airflow + Backend: postgres 9.6 + + No kind clusters found. + + Creating cluster + + Creating cluster "airflow-python-3.7-v1.17.0" ... + ✓ Ensuring node image (kindest/node:v1.17.0) 🖼 + ✓ Preparing nodes 📦 📦 + ✓ Writing configuration 📜 + ✓ Starting control-plane 🕹️ + ✓ Installing CNI 🔌 + Could not read storage manifest, falling back on old k8s.io/host-path default ... + ✓ Installing StorageClass 💾 + ✓ Joining worker nodes 🚜 + Set kubectl context to "kind-airflow-python-3.7-v1.17.0" + You can now use your cluster with: -Airflow test suite is based on Travis CI framework as running all of the tests -locally requires significant setup. You can set up Travis CI in your fork of -Airflow by following the -`Travis CI Getting Started guide `__. + kubectl cluster-info --context kind-airflow-python-3.7-v1.17.0 -Consider using Travis CI framework if you submit multiple pull requests -and want to speed up your builds. + Have a question, bug, or feature request? Let us know! https://kind.sigs.k8s.io/#community 🙂 -There are two different options available for running Travis CI, and they are -set up on GitHub as separate components: + Created cluster airflow-python-3.7-v1.17.0 -- **Travis CI GitHub App** (new version) -- **Travis CI GitHub Services** (legacy version) -Travis CI GitHub App (new version) ----------------------------------- +2. Check the status of the cluster -1. Once `installed `__, - configure the Travis CI GitHub App at - `Configure Travis CI `__. +.. code-block:: bash + + ./breeze kind-cluster status + + Checks status of Kind Kubernetes cluster + + Use CI image. + + Branch name: master + Docker image: apache/airflow:master-python3.7-ci + + Airflow source version: 2.0.0.dev0 + Python version: 3.7 + DockerHub user: apache + DockerHub repo: airflow + Backend: postgres 9.6 + + airflow-python-3.7-v1.17.0-control-plane + airflow-python-3.7-v1.17.0-worker + +3. Deploy Airflow to the cluster + +.. code-block:: bash + + ./breeze kind-cluster deploy -2. Set repository access to either "All repositories" for convenience, or "Only - select repositories" and choose ``USERNAME/airflow`` in the drop-down menu. +4. Run Kubernetes tests -3. Access Travis CI for your fork at ``__. +Note that the tests are executed in production container not in the CI container. +There is no need for the tests to run inside the Airflow CI container image as they only +communicate with the Kubernetes-run Airflow deployed via the production image. +Those Kubernetes tests require virtualenv to be created locally with airflow installed. +The virtualenv required will be created automatically when the scripts are run. -Travis CI GitHub Services (legacy version) ------------------------------------------- +4a) You can run all the tests -**NOTE:** The apache/airflow project is still using the legacy version. +.. code-block:: bash -Travis CI GitHub Services version uses an Authorized OAuth App. + ./breeze kind-cluster test -1. Once installed, configure the Travis CI Authorized OAuth App at - `Travis CI OAuth APP `__. -2. If you are a GitHub admin, click the **Grant** button next to your - organization; otherwise, click the **Request** button. For the Travis CI - Authorized OAuth App, you may have to grant access to the forked - ``ORGANIZATION/airflow`` repo even though it is public. +4b) You can enter an interactive shell to run tests one-by-one -3. 
Access Travis CI for your fork at
-  ``_.
+This prepares and enters the virtualenv in the ``.build/.kubernetes_venv_`` folder:
+
+.. code-block:: bash
 
-Creating New Projects in Travis CI
-----------------------------------
+    ./breeze kind-cluster shell
+
+Once you enter the environment, you receive this information:
+
+
+.. code-block:: bash
 
-If you need to create a new project in Travis CI, use travis-ci.com for both
-private repos and open source.
+  Activating the virtual environment for kubernetes testing
+
+  You can run kubernetes testing via 'pytest kubernetes_tests/....'
+  You can add -s to see the output of your tests on screen
+
+  The webserver is available at http://localhost:8080/
+
+  User/password: admin/admin
+
+  You are entering the virtualenv now. Type exit to exit back to the original shell
+
+5. In a separate terminal, you can open the k9s CLI:
+
+.. code-block:: bash
+
+    ./breeze kind-cluster k9s
+
+Use it to observe what's going on in your cluster.
+
+6. Debugging in IntelliJ/PyCharm
+
+It is very easy to run and debug Kubernetes tests with IntelliJ/PyCharm. Unlike the regular tests, they are
+in the ``kubernetes_tests`` folder, and if you followed the previous steps and entered the shell using the
+``./breeze kind-cluster shell`` command, you can set up your IDE very easily to run (and debug) your
+tests using the standard IntelliJ Run/Debug feature. You just need a few steps:
+
+a) Add the virtualenv as the interpreter for the project:
+
+.. image:: images/testing/kubernetes-virtualenv.png
+    :align: center
+    :alt: Kubernetes testing virtualenv
+
+The virtualenv is created in your "Airflow" source directory in the
+``.build/.kubernetes_venv_`` folder and you
+have to find the ``python`` binary there and choose it when selecting the interpreter.
+
+b) Choose pytest as the test runner:
+
+.. image:: images/testing/pytest-runner.png
+    :align: center
+    :alt: Pytest runner
+
+c) Run/Debug tests using the standard "Run/Debug" feature of IntelliJ:
+
+.. image:: images/testing/run-tests.png
+    :align: center
+    :alt: Run/Debug tests
+
+
+NOTE! The first time you run it, it will likely fail with
+``kubernetes.config.config_exception.ConfigException``:
+``Invalid kube-config file. Expected key current-context in kube-config``. You need to add the KUBECONFIG
+environment variable, copying its value from the result of "./breeze kind-cluster test":
+
+.. code-block:: bash
+
+  echo ${KUBECONFIG}
+
+  /home/jarek/code/airflow/.build/.kube/config
+
+
+.. image:: images/testing/kubeconfig-env.png
+    :align: center
+    :alt: Run/Debug tests
+
+
+The configuration for kubernetes is stored in your "Airflow" source directory in the ".build/.kube/config" file
+and this is where the KUBECONFIG env variable should point to.
+
+You can iterate with the tests while you are in the virtualenv. All the tests requiring a kubernetes cluster
+are in the "kubernetes_tests" folder. You can add extra ``pytest`` parameters (for example, ``-s`` will
+print the test logs and print statements to the terminal immediately):
+
+.. code-block:: bash
+
+    pytest kubernetes_tests/test_kubernetes_executor.py::TestKubernetesExecutor::test_integration_run_dag_with_scheduler_failure -s
+
+
+You can modify the tests or the KubernetesPodOperator and re-run them without re-deploying
+Airflow to the KinD cluster. A minimal sketch of such a test is shown below.
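+
+The sketch below is a hypothetical example of such a test (the test name and the ``/health`` check are
+illustrative only). It assumes nothing beyond the port-forwarded webserver at http://localhost:8080/
+mentioned above and the ``requests`` library being available in the virtualenv:
+
+.. code-block:: python
+
+    import requests
+
+
+    def test_webserver_is_healthy():
+        # The deploy step forwards the webserver running in the Kind cluster to localhost:8080.
+        response = requests.get("http://localhost:8080/health", timeout=10)
+        assert response.status_code == 200
+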
+ + +Sometimes there are side effects from running tests. You can run ``redeploy_airflow.sh`` without +recreating the whole cluster. This will delete the whole namespace, including the database data +and start a new Airflow deployment in the cluster. + +.. code-block:: bash + + ./scripts/ci/redeploy_airflow.sh + +If needed you can also delete the cluster manually: + + +.. code-block:: bash + + kind get clusters + kind delete clusters + +Kind has also useful commands to inspect your running cluster: + +.. code-block:: text + + kind --help + + +However, when you change Airflow Kubernetes executor implementation you need to redeploy +Airflow to the cluster. + +.. code-block:: bash + + ./breeze kind-cluster deploy + + +7. Stop KinD cluster when you are done + +.. code-block:: bash + + ./breeze kind-cluster stop -- `Open Source on travis-ci.com `__. -- `Legacy GitHub Services to GitHub Apps Migration Guide `__. -- `Migrating Multiple Repositories to GitHub Apps Guide `__. Airflow System Tests ==================== @@ -480,21 +774,65 @@ if you have appropriate credentials configured for your tests. The system tests derive from the ``tests.test_utils.system_test_class.SystemTests`` class. They should also be marked with ``@pytest.marker.system(SYSTEM)`` where ``system`` designates the system to be tested (for example, ``google.cloud``). These tests are skipped by default. -You can execute the system tests by providing the ``--systems SYSTEMS`` flag to ``pytest``. + +You can execute the system tests by providing the ``--system SYSTEM`` flag to ``pytest``. You can +specify several --system flags if you want to execute tests for several systems. The system tests execute a specified example DAG file that runs the DAG end-to-end. See more details about adding new system tests below. -Running System Tests --------------------- +Environment for System Tests +---------------------------- + **Prerequisites:** You may need to set some variables to run system tests. If you need to -add some intialization of environment variables to Breeze, you can always add a +add some initialization of environment variables to Breeze, you can add a ``variables.env`` file in the ``files/airflow-breeze-config/variables.env`` file. It will be automatically -sourced when entering the Breeze environment. +sourced when entering the Breeze environment. You can also add some additional +initialization commands in this file if you want to execute something +always at the time of entering Breeze. + +There are several typical operations you might want to perform such as: + +* generating a file with the random value used across the whole Breeze session (this is useful if + you want to use this random number in names of resources that you create in your service +* generate variables that will be used as the name of your resources +* decrypt any variables and resources you keep as encrypted in your configuration files +* install additional packages that are needed in case you are doing tests with 1.10.* Airflow series + (see below) + +Example variables.env file is shown here (this is part of the variables.env file that is used to +run Google Cloud system tests. + +.. code-block:: bash + + # Build variables. This file is sourced by Breeze. + # Also it is sourced during continuous integration build in Cloud Build + + # Auto-export all variables + set -a + + echo + echo "Reading variables" + echo + + # Generate random number that will be used across your session + RANDOM_FILE="/random.txt" + + if [[ ! 
-f "${RANDOM_FILE}" ]]; then + echo "${RANDOM}" > "${RANDOM_FILE}" + fi + + RANDOM_POSTFIX=$(cat "${RANDOM_FILE}") + + # install any packages from dist folder if they are available + if [[ ${RUN_AIRFLOW_1_10:=} == "true" ]]; then + pip install /dist/apache_airflow_providers_{google,postgres,mysql}*.whl || true + fi + +To execute system tests, specify the ``--system SYSTEM` +flag where ``SYSTEM`` is a system to run the system tests for. It can be repeated. -To execute system tests, specify the ``--systems SYSTEMS`` -flag where ``SYSTEMS`` is a coma-separated list of systems to run the system tests for. Forwarding Authentication from the Host ---------------------------------------------------- @@ -505,18 +843,15 @@ credentials stored in your ``home`` directory. Use this feature with care as it visible to anything that you have installed inside the Docker container. Currently forwarded credentials are: - * all credentials stored in ``${HOME}/.config`` (for example, GCP credentials) - * credentials stored in ``${HOME}/.gsutil`` for ``gsutil`` tool from GCS - * credentials stored in ``${HOME}/.boto`` and ``${HOME}/.s3`` (for AWS authentication) - * credentials stored in ``${HOME}/.docker`` for docker - * credentials stored in ``${HOME}/.kube`` for kubectl - * credentials stored in ``${HOME}/.ssh`` for SSH - + * credentials stored in ``${HOME}/.aws`` for the aws Amazon Web Services client + * credentials stored in ``${HOME}/.azure`` for the az Microsoft Azure client + * credentials stored in ``${HOME}/.config`` for gcloud Google Cloud Platform client (among others) + * credentials stored in ``${HOME}/.docker`` for docker client Adding a New System Test -------------------------- -We are working on automating system tests execution (AIP-4) but for now system tests are skipped when +We are working on automating system tests execution (AIP-4) but for now, system tests are skipped when tests are run in our CI system. But to enable the test automation, we encourage you to add system tests whenever an operator/hook/sensor is added/modified in a given system. @@ -525,50 +860,210 @@ tests whenever an operator/hook/sensor is added/modified in a given system. ``@pytest.mark.system(SYSTEM_NAME)`` marker. The system name should follow the path defined in the ``providers`` package (for example, the system tests from ``tests.providers.google.cloud`` package should be marked with ``@pytest.mark.system("google.cloud")``. + * If your system tests need some credential files to be available for an authentication with external systems, make sure to keep these credentials in the ``files/airflow-breeze-config/keys`` directory. Mark your tests with ``@pytest.mark.credential_file()`` so that they are skipped if such a credential file is not there. - The tests should read the right credentials and authenticate on their own. The credentials are read + The tests should read the right credentials and authenticate them on their own. The credentials are read in Breeze from the ``/files`` directory. The local "files" folder is mounted to the "/files" folder in Breeze. -* If your system tests are long-lasting ones (i.e., require more than 20-30 minutes + +* If your system tests are long-runnin ones (i.e., require more than 20-30 minutes to complete), mark them with the ```@pytest.markers.long_running`` marker. - Such tests are skipped by default unless you specify the ``--long-lasting`` flag to pytest. + Such tests are skipped by default unless you specify the ``--long-running`` flag to pytest. 
+
+* The system test itself (python class) does not have any logic. Such a test runs
+  the DAG specified by its ID. This DAG should contain the actual DAG logic
+  to execute. Make sure to define the DAG in ``providers//example_dags``. These example DAGs are also
+  used to take some snippets of code out of them when documentation is generated. So, having these
-  DAGs runnable is a great way to make sure the documenation is describing a working example. Inside
-  your test class/test method, simply use ``self.run_dag(,)`` to run the DAG. Then,
+  DAGs runnable is a great way to make sure the documentation is describing a working example. Inside your test class/test method, simply use ``self.run_dag(,)`` to run the DAG. Then,
   the system class will take care of running the DAG. Note that the DAG_FOLDER should be
   a subdirectory of the ``tests.test_utils.AIRFLOW_MAIN_FOLDER`` + ``providers//example_dags``.
 
-An example of a system test is available in:
-``airflow.tests.providers.google.operators.test_natunal_language_system.CloudNaturalLanguageExampleDagsTest``.
+A simple example of a system test is available in:
 
-It runs the DAG defined in ``airflow.providers.google.cloud.example_dags.example_natural_language.py``.
+``tests/providers/google/cloud/operators/test_compute_system.py``.
 
-Running Tests for Older Airflow Versions
-----------------------------------------
+It runs two DAGs defined in ``airflow.providers.google.cloud.example_dags.example_compute.py`` and
+``airflow.providers.google.cloud.example_dags.example_compute_igm.py``.
+
+Installing backport packages for Airflow 1.10.* series
+-------------------------------------------------------
 
 The tests can be executed against the master version of Airflow but they also work
 with older versions. This is especially useful to test back-ported operators
 from Airflow 2.0 to 1.10.* versions.
 
 To run the tests for Airflow 1.10.* series, you need to run Breeze with
-``--install-airflow-version==`` to install a different version of Airflow.
+``--install-airflow-version=`` to install a different version of Airflow.
 If ``current`` is specified (default), then the current version of Airflow is used.
 Otherwise, the released version of Airflow is installed.
 
-The commands make sure that the source version of master Airflow is removed and the released version of
-Airflow from ``Pypi`` is installed. Note that tests sources are not removed and they can be used
-to run tests (unit tests and system tests) against the freshly installed version.
+The ``--install-airflow-version=`` switch makes sure that the current (from sources) version of
+Airflow is removed and the released version of Airflow from ``PyPI`` is installed. Note that the test sources
+are not removed and they can be used to run tests (unit tests and system tests) against the
+freshly installed version.
+
+You should automate installing the backport packages in your own
+``./files/airflow-breeze-config/variables.env`` file. You should make it depend on the
+``RUN_AIRFLOW_1_10`` variable being equal to "true" so that
+the installation of backport packages is only performed when you install Airflow 1.10.*.
+The backport packages are available in the ``/dist`` directory if they were prepared as described
+in the previous chapter.
+
+Typically, the command in your variables.env file will be similar to:
+
+.. 
code-block:: bash
+
+    # install any packages from dist folder if they are available
+    if [[ ${RUN_AIRFLOW_1_10:=} == "true" ]]; then
+        pip install /dist/apache_airflow_providers_{google,postgres,mysql}*.whl || true
+    fi
+
+The command above will automatically install the backported google, postgres, and mysql packages if they
+were prepared before entering Breeze.
+
+
+Running system tests for backported packages in Airflow 1.10.* series
+----------------------------------------------------------------------
+
+Once you have installed the 1.10.* Airflow version with ``--install-airflow-version`` and prepared and
+installed the required packages via ``variables.env``, it should be as easy as running
+``pytest --system= TEST_NAME``. Note that the default timeout for running
+system tests is set to 8 minutes; some system tests might take much longer to run, so you might
+want to add ``-o faulthandler_timeout=2400`` (2400 s = 40 minutes, for example) to your
+pytest command.
+
+The typical system test session
+-------------------------------
+
+Here is a typical session that you go through to run system tests:
+
+1. Prepare backport packages:
+
+.. code-block:: bash
+
+  ./scripts/ci/ci_prepare_backport_packages.sh google postgres mysql
+
+2. Enter Breeze, installing Airflow 1.10.*, forwarding credentials and installing
+   backported packages (you need an appropriate line in ``./files/airflow-breeze-config/variables.env``):
+
+.. code-block:: bash
+
+  ./breeze --install-airflow-version 1.10.9 --python 3.6 --db-reset --forward-credentials restart
+
+This will:
+
+* install Airflow 1.10.9
+* restart the whole environment (i.e. recreate the metadata database from scratch)
+* run Breeze with the Python 3.6 version
+* reset the Airflow database
+* forward your local credentials to Breeze
+
+3. Run the tests:
+
+.. code-block:: bash
+
+   pytest -o faulthandler_timeout=2400 \
+     --system=google tests/providers/google/cloud/operators/test_compute_system.py
+
+
+Iteration with System Tests if your resources are slow to create
+-----------------------------------------------------------------
+
+When you want to iterate on system tests, you might want to create the slow resources first.
+
+If you need to set up some external resources for your tests (for example compute instances in Google Cloud),
+you should set them up and tear them down in the setUp/tearDown methods of your tests.
+Since those resources might be slow to create, you might want to add some helpers that
+set them up and tear them down separately via manual operations. This way you can iterate on
+the tests without waiting for setUp and tearDown with every test.
+
+In this case, you should build in a mechanism to skip setUp and tearDown in case you manually
+created the resources. A somewhat complex example of that can be found in
+``tests.providers.google.cloud.operators.test_cloud_sql_system.py`` and the helper is
+available in ``tests.providers.google.cloud.operators.test_cloud_sql_system_helper.py``.
+
+The helper is run with ``--action create`` to create the Cloud SQL instances, which are very slow
+to create and set up, so that you can iterate on running the system tests without
+losing time creating them every time. A temporary file is created to prevent
+setting up and tearing down the instances when running the test.
+
+This example also shows how you can use the random number generated at the entry of Breeze if you
+have it in your variables.env (see the previous chapter).
In the case of Cloud SQL, you cannot reuse the
+same instance name for a week, so we generate a random number that is used across the whole session
+and store it in the ``/random.txt`` file so that the names are unique during tests.
+
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Important !!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+Do not forget to delete manually created resources before leaving the
+Breeze session. They are usually expensive to run.
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Important !!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+Note that in case you have to update your backported operators or system tests (they are part of
+the backport packages) you need to rebuild the packages outside of Breeze and
+``pip uninstall/pip install`` those packages to get them installed. This is not needed
+if you run system tests with the ``current`` Airflow version, so it is better to iterate on the
+system tests with the ``current`` version, fix all problems there, and only afterwards run
+the tests with Airflow 1.10.*.
+
+The typical session then looks as follows:
+
+1. Prepare backport packages:
+
+.. code-block:: bash
+
+  ./scripts/ci/ci_prepare_backport_packages.sh google postgres mysql
+
+2. Enter Breeze, installing Airflow 1.10.*, forwarding credentials and installing
+   backported packages (you need an appropriate line in ``./files/airflow-breeze-config/variables.env``):
+
+.. code-block:: bash
+
+  ./breeze --install-airflow-version 1.10.9 --python 3.6 --db-reset --forward-credentials restart
+
+3. Run the create action in the helper (to create the resources that are slow to create):
+
+.. code-block:: bash
+
+    python tests/providers/google/cloud/operators/test_cloud_sql_system_helper.py --action create
+
+4. Run the tests:
+
+.. code-block:: bash
+
+    pytest -o faulthandler_timeout=2400 \
+       --system=google tests/providers/google/cloud/operators/test_compute_system.py
+
+5. In case you are running backport package tests, you need to rebuild and reinstall a package
+   every time you change the operators/hooks or example_dags. The example below shows reinstallation
+   of the google package:
+
+In the host:
+
+.. code-block:: bash
+
+  ./scripts/ci/ci_prepare_backport_packages.sh google
+
+In the container:
+
+.. code-block:: bash
+
+  pip uninstall apache-airflow-providers-google
+  pip install /dist/apache_airflow_providers_google-*.whl
+
+Points 4 and 5 can be repeated multiple times without leaving the container.
+
+6. Run the delete action in the helper:
+
+.. code-block:: bash
+
+    python tests/providers/google/cloud/operators/test_cloud_sql_system_helper.py --action delete
 
-This works best for system tests: all the system tests should work for at least latest released 1.10.x
-Airflow version. Some of the unit and integration tests might also work in the same
-fashion but it is not necessary or expected.
 
 Local and Remote Debugging in IDE
 =================================
@@ -588,7 +1083,7 @@ You can set up your remote debugging session as follows:
     :align: center
     :alt: Setup remote debugging
 
-Note that on macOS, you have to use a real IP address of your host rather than default
+Note that on macOS, you have to use a real IP address of your host rather than the default
 localhost because on macOS the container runs in a virtual machine with a different IP address.
Make sure to configure source code mapping in the remote debugging configuration to map @@ -598,10 +1093,99 @@ your local sources to the ``/opt/airflow`` location of the sources within the co :align: center :alt: Source code mapping +Setup VM on GCP with SSH forwarding +----------------------------------- + +Below are the steps you need to take to set up your virtual machine in the Google Cloud Platform. + +1. The next steps will assume that you have configured environment variables with the name of the network and + a virtual machine, project ID and the zone where the virtual machine will be created + + .. code-block:: bash + + PROJECT_ID="" + GCP_ZONE="europe-west3-a" + GCP_NETWORK_NAME="airflow-debugging" + GCP_INSTANCE_NAME="airflow-debugging-ci" + +2. It is necessary to configure the network and firewall for your machine. + The firewall must have unblocked access to port 22 for SSH traffic and any other port for the debugger. + In the example for the debugger, we will use port 5555. + + .. code-block:: bash + + gcloud compute --project="${PROJECT_ID}" networks create "${GCP_NETWORK_NAME}" \ + --subnet-mode=auto + + gcloud compute --project="${PROJECT_ID}" firewall-rules create "${GCP_NETWORK_NAME}-allow-ssh" \ + --network "${GCP_NETWORK_NAME}" \ + --allow tcp:22 \ + --source-ranges 0.0.0.0/0 + + gcloud compute --project="${PROJECT_ID}" firewall-rules create "${GCP_NETWORK_NAME}-allow-debugger" \ + --network "${GCP_NETWORK_NAME}" \ + --allow tcp:5555 \ + --source-ranges 0.0.0.0/0 + +3. If you have a network, you can create a virtual machine. To save costs, you can create a `Preemptible + virtual machine ` that is automatically deleted for up + to 24 hours. + + .. code-block:: bash + + gcloud beta compute --project="${PROJECT_ID}" instances create "${GCP_INSTANCE_NAME}" \ + --zone="${GCP_ZONE}" \ + --machine-type=f1-micro \ + --subnet="${GCP_NETWORK_NAME}" \ + --image=debian-10-buster-v20200210 \ + --image-project=debian-cloud \ + --preemptible + + To check the public IP address of the machine, you can run the command + + .. code-block:: bash + + gcloud compute --project="${PROJECT_ID}" instances describe "${GCP_INSTANCE_NAME}" \ + --zone="${GCP_ZONE}" \ + --format='value(networkInterfaces[].accessConfigs[0].natIP.notnull().list())' + +4. The SSH Deamon's default configuration does not allow traffic forwarding to public addresses. + To change it, modify the ``GatewayPorts`` options in the ``/etc/ssh/sshd_config`` file to ``Yes`` + and restart the SSH daemon. + + .. code-block:: bash + + gcloud beta compute --project="${PROJECT_ID}" ssh "${GCP_INSTANCE_NAME}" \ + --zone="${GCP_ZONE}" -- \ + sudo sed -i "s/#\?\s*GatewayPorts no/GatewayPorts Yes/" /etc/ssh/sshd_config + + gcloud beta compute --project="${PROJECT_ID}" ssh "${GCP_INSTANCE_NAME}" \ + --zone="${GCP_ZONE}" -- \ + sudo service sshd restart + +5. To start port forwarding, run the following command: + + .. code-block:: bash + + gcloud beta compute --project="${PROJECT_ID}" ssh "${GCP_INSTANCE_NAME}" \ + --zone="${GCP_ZONE}" -- \ + -N \ + -R 0.0.0.0:5555:localhost:5555 \ + -v + +If you have finished using the virtual machine, remember to delete it. + + .. code-block:: bash + + gcloud beta compute --project="${PROJECT_ID}" instances delete "${GCP_INSTANCE_NAME}" \ + --zone="${GCP_ZONE}" + +You can use the GCP service for free if you use the `Free Tier `__. 
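+
+As an illustration of how the forwarded port can be used, here is a minimal sketch of attaching a
+process in the container to a PyCharm debug server through the VM. It assumes the ``pydevd-pycharm``
+package is installed and that your IDE's debug server listens behind the forwarded port 5555; the IP
+address below is only a placeholder for the VM address printed by the ``describe`` command above:
+
+.. code-block:: python
+
+    import pydevd_pycharm
+
+    # Connect back to the IDE debug server exposed on the VM's public IP via the SSH tunnel.
+    pydevd_pycharm.settrace(
+        "203.0.113.10",  # placeholder: replace with the VM's public IP address
+        port=5555,
+        stdoutToServer=True,
+        stderrToServer=True,
+    )
+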
+ DAG Testing =========== -To ease and speed up process of developing DAGs, you can use +To ease and speed up the process of developing DAGs, you can use py:class:`~airflow.executors.debug_executor.DebugExecutor`, which is a single process executor for debugging purposes. Using this executor, you can run and debug DAGs from your IDE. @@ -638,9 +1222,9 @@ To run the tests for Airflow 1.10.* series, you need to run Breeze with If ``current`` is specified (default), then the current version of Airflow is used. Otherwise, the released version of Airflow is installed. -You should also consider running it with ``restart`` command when you change installed version. +You should also consider running it with ``restart`` command when you change the installed version. This will clean-up the database so that you start with a clean DB and not DB installed in a previous version. -So typically you'd run it like ``breeze --install-ariflow-version=1.10.9 restart``. +So typically you'd run it like ``breeze --install-airflow-version=1.10.9 restart``. BASH Unit Testing (BATS) ======================== diff --git a/UPDATING.md b/UPDATING.md index fbea0b3af9d88..78a3f0b0690d3 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -25,6 +25,11 @@ assists users migrating to a new version. **Table of contents** +- [Airflow 1.10.15](#airflow-11015) +- [Airflow 1.10.14](#airflow-11014) +- [Airflow 1.10.13](#airflow-11013) +- [Airflow 1.10.12](#airflow-11012) +- [Airflow 1.10.11](#airflow-11011) - [Airflow 1.10.10](#airflow-11010) - [Airflow 1.10.9](#airflow-1109) - [Airflow 1.10.8](#airflow-1108) @@ -59,6 +64,194 @@ https://developers.google.com/style/inclusive-documentation --> +## Airflow 1.10.15 + +No breaking changes. + +## Airflow 1.10.14 + +### `[scheduler] max_threads` config has been renamed to `[scheduler] parsing_processes` + +From Airflow 1.10.14, `max_threads` config under `[scheduler]` section has been renamed to `parsing_processes`. + +This is to align the name with the actual code where the Scheduler launches the number of processes defined by +`[scheduler] parsing_processes` to parse the DAG files. + +### Airflow CLI changes in line with 2.0 + +The Airflow CLI has been organized so that related commands are grouped together as subcommands, +which means that if you use these commands in your scripts, you have to make changes to them. + +This section describes the changes that have been made, and what you need to do to update your script. + +The ability to manipulate users from the command line has been changed. ``airflow create_user``, ``airflow delete_user`` + and ``airflow list_users`` has been grouped to a single command `airflow users` with optional flags `create`, `list` and `delete`. + +The `airflow list_dags` command is now `airflow dags list`, `airflow pause` is `airflow dags pause`, etc. + +In Airflow 1.10 and 2.0 there is an `airflow config` command but there is a difference in behavior. In Airflow 1.10, +it prints all config options while in Airflow 2.0, it's a command group. `airflow config` is now `airflow config list`. +You can check other options by running the command `airflow config --help` + +Compatibility with the old CLI has been maintained, but they will no longer appear in the help + +You can learn about the commands by running ``airflow --help``. For example to get help about the ``celery`` group command, +you have to run the help command: ``airflow celery --help``. 
+ +| Old command | New command | Group | +|-------------------------------|------------------------------------|--------------------| +| ``airflow worker`` | ``airflow celery worker`` | ``celery`` | +| ``airflow flower`` | ``airflow celery flower`` | ``celery`` | +| ``airflow trigger_dag`` | ``airflow dags trigger`` | ``dags`` | +| ``airflow delete_dag`` | ``airflow dags delete`` | ``dags`` | +| ``airflow show_dag`` | ``airflow dags show`` | ``dags`` | +| ``airflow list_dag`` | ``airflow dags list`` | ``dags`` | +| ``airflow dag_status`` | ``airflow dags status`` | ``dags`` | +| ``airflow backfill`` | ``airflow dags backfill`` | ``dags`` | +| ``airflow list_dag_runs`` | ``airflow dags list-runs`` | ``dags`` | +| ``airflow pause`` | ``airflow dags pause`` | ``dags`` | +| ``airflow unpause`` | ``airflow dags unpause`` | ``dags`` | +| ``airflow next_execution`` | ``airflow dags next-execution`` | ``dags`` | +| ``airflow test`` | ``airflow tasks test`` | ``tasks`` | +| ``airflow clear`` | ``airflow tasks clear`` | ``tasks`` | +| ``airflow list_tasks`` | ``airflow tasks list`` | ``tasks`` | +| ``airflow task_failed_deps`` | ``airflow tasks failed-deps`` | ``tasks`` | +| ``airflow task_state`` | ``airflow tasks state`` | ``tasks`` | +| ``airflow run`` | ``airflow tasks run`` | ``tasks`` | +| ``airflow render`` | ``airflow tasks render`` | ``tasks`` | +| ``airflow initdb`` | ``airflow db init`` | ``db`` | +| ``airflow resetdb`` | ``airflow db reset`` | ``db`` | +| ``airflow upgradedb`` | ``airflow db upgrade`` | ``db`` | +| ``airflow checkdb`` | ``airflow db check`` | ``db`` | +| ``airflow shell`` | ``airflow db shell`` | ``db`` | +| ``airflow pool`` | ``airflow pools`` | ``pools`` | +| ``airflow create_user`` | ``airflow users create`` | ``users`` | +| ``airflow delete_user`` | ``airflow users delete`` | ``users`` | +| ``airflow list_users`` | ``airflow users list`` | ``users`` | +| ``airflow rotate_fernet_key`` | ``airflow rotate-fernet-key`` | | +| ``airflow sync_perm`` | ``airflow sync-perm`` | | + +## Airflow 1.10.13 + +### TimeSensor is now timezone aware + +Previously `TimeSensor` always compared the `target_time` with the current time in UTC. + +Now it will compare `target_time` with the current time in the timezone of the DAG, +defaulting to the `default_timezone` in the global config. + +### Removed Kerberos support for HDFS hook + +The HDFS hook's Kerberos support has been removed due to removed python-krbV dependency from PyPI +and generally lack of support for SSL in Python3 (Snakebite-py3 we use as dependency has no +support for SSL connection to HDFS). + +SSL support still works for WebHDFS hook. + +### Unify user session lifetime configuration + +In previous version of Airflow user session lifetime could be configured by +`session_lifetime_days` and `force_log_out_after` options. In practise only `session_lifetime_days` +had impact on session lifetime, but it was limited to values in day. +We have removed mentioned options and introduced new `session_lifetime_minutes` +option which simplify session lifetime configuration. + +Before + + ```ini +[webserver] +force_log_out_after = 0 +session_lifetime_days = 30 + ``` + +After + + ```ini +[webserver] +session_lifetime_minutes = 43200 + ``` + +### Adding Operators, Hooks and Sensors via Airflow Plugins is deprecated + +The ability to import Operators, Hooks and Senors via the plugin mechanism has been deprecated and will raise warnings +in Airflow 1.10.13 and will be removed completely in Airflow 2.0. 
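+
+For illustration, here is a minimal sketch of the change in a DAG file (the package, module and class
+names below are hypothetical):
+
+```python
+# Deprecated: importing an operator through the plugin-provided namespace
+# from airflow.operators.my_plugin import MyCustomOperator
+
+# Recommended: import the operator directly from the module that defines it
+from my_company.operators.my_custom_operator import MyCustomOperator
+```
+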
+
+Check http://airflow.apache.org/docs/1.10.13/howto/custom-operator.html to see how you can create and import
+Custom Hooks, Operators and Sensors.
+
+## Airflow 1.10.12
+
+### Clearing tasks skipped by SkipMixin will skip them
+
+Previously, when tasks skipped by SkipMixin (such as BranchPythonOperator, BaseBranchOperator and ShortCircuitOperator) were cleared, they executed. Since 1.10.12, when such skipped tasks are cleared,
+they will be skipped again by the newly introduced NotPreviouslySkippedDep.
+
+### The pod_mutation_hook function will now accept a kubernetes V1Pod object
+
+As of Airflow 1.10.12, using the `airflow.contrib.kubernetes.Pod` class in the `pod_mutation_hook` is now deprecated. Instead, we recommend that users
+treat the `pod` parameter as a `kubernetes.client.models.V1Pod` object. This means that users now have access to the full Kubernetes API
+when modifying Airflow pods.
+
+### pod_template_file option now available in the KubernetesPodOperator
+
+Users can now provide a path to a YAML file to the KubernetesPodOperator using the `pod_template_file` parameter.
+
+## Airflow 1.10.11
+
+### Use NULL as default value for dag.description
+
+NULL is now used as the default value for `dag.description` in the `dag` table.
+
+### Restrict editing DagRun State in the old UI (Flask-admin based UI)
+
+Before 1.10.11, it was possible to edit the DagRun state in the `/admin/dagrun/` page
+ to any text value.
+
+In Airflow 1.10.11+, the user can only choose the states from the list.
+
+### Experimental API will deny all requests by default
+
+The previous default setting was to allow all API requests without authentication, but this poses security
+risks to users who miss this fact. This changes the default for new installs to deny all requests by default.
+
+**Note**: This will not change the behavior for existing installs, please check your airflow.cfg.
+
+If you wish to have the experimental API work, and are aware of the risks of enabling it without authentication
+(or if you have your own authentication layer in front of Airflow), you can get
+the previous behaviour on a new install by setting this in your airflow.cfg:
+
+```
+[api]
+auth_backend = airflow.api.auth.backend.default
+```
+
+### XCom Values can no longer be added or changed from the Webserver
+
+Since XCom values can contain pickled data, we no longer allow adding or
+changing XCom values from the UI.
+
+### Default for `run_as_user` has been changed to 50000 from 0
+
+The UID used to run the first process of the Worker PODs has been changed to `50000`
+from the previous default of `0`. The previous default was an empty string, but the code used `0` if it was
+an empty string.
+
+**Before**:
+
+```ini
+[kubernetes]
+run_as_user =
+```
+
+**After**:
+
+```ini
+[kubernetes]
+run_as_user = 50000
+```
+
+This is done to avoid running the container as the `root` user.
+
 ## Airflow 1.10.10
 
 ### Setting Empty string to a Airflow Variable will return an empty string
@@ -99,7 +292,6 @@ When task is marked failed by user or task fails due to system failures - on fai
 
 See [AIRFLOW-5621](https://jira.apache.org/jira/browse/AIRFLOW-5621) for details
 
-
 ## Airflow 1.10.7
 
 ### Changes in experimental API execution_date microseconds replacement
@@ -171,12 +363,6 @@ This is the correct behavior for use with BigQuery, since BigQuery assumes that
 TIMESTAMP columns without time zones are in UTC.
To preserve the previous behavior, set `ensure_utc` to `False.` -### Python 2 support is going away - -Airflow 1.10 will be the last release series to support Python 2. Airflow 2.0.0 will only support Python 3.5 and up. - -If you have a specific task that still requires Python 2 then you can use the PythonVirtualenvOperator for this. - ### Changes to DatastoreHook * removed argument `version` from `get_conn` function and added it to the hook's `__init__` function instead and renamed it to `api_version` @@ -304,7 +490,7 @@ If the `AIRFLOW_CONFIG` environment variable was not set and the will discover its config file using the `$AIRFLOW_CONFIG` and `$AIRFLOW_HOME` environment variables rather than checking for the presence of a file. -### Changes in Google Cloud Platform related operators +### Changes in Google Cloud related operators Most GCP-related operators have now optional `PROJECT_ID` parameter. In case you do not specify it, the project id configured in @@ -331,7 +517,7 @@ Operators involved: Other GCP operators are unaffected. -### Changes in Google Cloud Platform related hooks +### Changes in Google Cloud related hooks The change in GCP operators implies that GCP Hooks for those operators require now keyword parameters rather than positional ones in all methods where `project_id` is used. The methods throw an explanatory exception @@ -399,7 +585,7 @@ gct_hook.create_transfer_job(body) ``` The change results from the unification of all hooks and adjust to [the official recommendations](https://lists.apache.org/thread.html/e8534d82be611ae7bcb21ba371546a4278aad117d5e50361fd8f14fe@%3Cdev.airflow.apache.org%3E) -for the Google Cloud Platform. +for the Google Cloud. The signature of `wait_for_transfer_job` method in `GCPTransferServiceHook` has changed. @@ -1006,14 +1192,11 @@ dags_are_paused_at_creation = False If you specify a hive conf to the run_cli command of the HiveHook, Airflow add some convenience variables to the config. In case you run a secure Hadoop setup it might be -required to whitelist these variables by adding the following to your configuration: +required to allow these variables by adjusting you hive configuration to add `airflow\.ctx\..*` to the regex +of user-editable configuration properties. See +[the Hive docs on Configuration Properties][hive.security.authorization.sqlstd] for more info. 
-```
-<property>
-     <name>hive.security.authorization.sqlstd.confwhitelist.append</name>
-     <value>airflow\.ctx\..*</value>
-</property>
-```
+[hive.security.authorization.sqlstd]: https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=82903061#ConfigurationProperties-SQLStandardBasedAuthorization.1

 ### Google Cloud Operator and Hook alignment
diff --git a/airflow/__init__.py b/airflow/__init__.py
index 3ac543f6248cd..287efe39e57a4 100644
--- a/airflow/__init__.py
+++ b/airflow/__init__.py
@@ -32,6 +32,7 @@

 __version__ = version.version

+import logging
 import sys
 # flake8: noqa: F401
@@ -39,18 +40,20 @@
 from airflow import utils
 from airflow import settings
 from airflow.configuration import conf
-from airflow.models import DAG
 from flask_admin import BaseView
 from importlib import import_module
 from airflow.exceptions import AirflowException

 settings.initialize()

+# Delay the import of airflow.models to be after the settings initialization to make sure that
+# any reference to a settings function (e.g. task_instance_mutation_hook) holds the expected implementation
+from airflow.models import DAG  # noqa: E402

 login = None  # type: Any
+log = logging.getLogger(__name__)

 def load_login():
-    log = LoggingMixin().log
     auth_backend = 'airflow.default_login'
     try:
diff --git a/airflow/_vendor/README b/airflow/_vendor/README
deleted file mode 100644
index a79ea89eae536..0000000000000
--- a/airflow/_vendor/README
+++ /dev/null
@@ -1,13 +0,0 @@
-Original files in this directory were created with the following commands::
-
-    mkdir -p slugify/
-    curl -fsSL -O https://files.pythonhosted.org/packages/1f/9c/8b07d625e9c9df567986d887f0375075abb1923e49d074a7803cd1527dae/python-slugify-2.0.1.tar.gz
-    tar -xzf python-slugify-*.tar.gz --strip-components=2 -C slugify/ '*/slugify/*'
-    tar -xzf python-slugify-*.tar.gz --strip-components=1 -C slugify/ '*/LICENSE'
-    rm *.tar.gz
-
-    mkdir -p nvd3/
-    curl -fsSL -O https://files.pythonhosted.org/packages/0b/aa/97165daa6e319409c5c2582e62736a7353bda3c90d90fdcb0b11e116dd2d/python-nvd3-0.15.0.tar.gz
-    tar -xzf python-nvd3-*.tar.gz --strip-components=2 -C nvd3/ '*/nvd3/*'
-    tar -xzf python-nvd3-*.tar.gz --strip-components=1 -C nvd3/ '*/LICENSE'
-    rm *.tar.gz
diff --git a/airflow/_vendor/nvd3/LICENSE b/airflow/_vendor/nvd3/LICENSE
deleted file mode 100644
index 1add6249e57b4..0000000000000
--- a/airflow/_vendor/nvd3/LICENSE
+++ /dev/null
@@ -1,24 +0,0 @@
-The MIT License (MIT)
-
-Python-nvd3
-
-Copyright (c) 2013 Arezqui Belaid and other contributors
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/airflow/_vendor/nvd3/NVD3Chart.py b/airflow/_vendor/nvd3/NVD3Chart.py deleted file mode 100644 index faefe5d3a0fcf..0000000000000 --- a/airflow/_vendor/nvd3/NVD3Chart.py +++ /dev/null @@ -1,506 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from __future__ import unicode_literals -from optparse import OptionParser -from jinja2 import Environment, PackageLoader -from airflow._vendor.slugify import slugify - -try: - import simplejson as json -except ImportError: - import json - -CONTENT_FILENAME = "./content.html" -PAGE_FILENAME = "./page.html" - - -pl = PackageLoader('airflow._vendor.nvd3', 'templates') -jinja2_env = Environment(lstrip_blocks=True, trim_blocks=True, loader=pl) - -template_content = jinja2_env.get_template(CONTENT_FILENAME) -template_page = jinja2_env.get_template(PAGE_FILENAME) - - -def stab(tab=1): - """ - create space tabulation - """ - return ' ' * 4 * tab - - -class NVD3Chart(object): - """ - NVD3Chart Base class. - """ - #: chart count - count = 0 - #: directory holding the assets (bower_components) - assets_directory = './bower_components/' - - # this attribute is overriden by children of this - # class - CHART_FILENAME = None - template_environment = Environment(lstrip_blocks=True, trim_blocks=True, - loader=pl) - - def __init__(self, **kwargs): - """ - This is the base class for all the charts. The following keywords are - accepted: - - :keyword: **display_container** - default: ``True`` - :keyword: **jquery_on_ready** - default: ``False`` - :keyword: **charttooltip_dateformat** - default: ``'%d %b %Y'`` - :keyword: **name** - default: the class name - ``model`` - set the model (e.g. ``pieChart``, ` - ``LineWithFocusChart``, ``MultiBarChart``). - :keyword: **color_category** - default - ``None`` - :keyword: **color_list** - default - ``None`` - used by pieChart (e.g. ``['red', 'blue', 'orange']``) - :keyword: **margin_bottom** - default - ``20`` - :keyword: **margin_left** - default - ``60`` - :keyword: **margin_right** - default - ``60`` - :keyword: **margin_top** - default - ``30`` - :keyword: **height** - default - ``''`` - :keyword: **width** - default - ``''`` - :keyword: **stacked** - default - ``False`` - :keyword: **focus_enable** - default - ``False`` - :keyword: **resize** - define - ``False`` - :keyword: **show_legend** - default - ``True`` - :keyword: **show_labels** - default - ``True`` - :keyword: **tag_script_js** - default - ``True`` - :keyword: **use_interactive_guideline** - default - ``False`` - :keyword: **chart_attr** - default - ``None`` - :keyword: **extras** - default - ``None`` - - Extra chart modifiers. Use this to modify different attributes of - the chart. - :keyword: **x_axis_date** - default - False - Signal that x axis is a date axis - :keyword: **date_format** - default - ``%x`` - see https://github.com/mbostock/d3/wiki/Time-Formatting - :keyword: **x_axis_format** - default - ``''``. - :keyword: **y_axis_format** - default - ``''``. - :keyword: **style** - default - ``''`` - Style modifiers for the DIV container. - :keyword: **color_category** - default - ``category10`` - - Acceptable values are nvd3 categories such as - ``category10``, ``category20``, ``category20c``. 
- """ - # set the model - self.model = self.__class__.__name__ #: The chart model, - - #: an Instance of Jinja2 template - self.template_page_nvd3 = template_page - self.template_content_nvd3 = template_content - self.series = [] - self.axislist = {} - # accepted keywords - self.display_container = kwargs.get('display_container', True) - self.charttooltip_dateformat = kwargs.get('charttooltip_dateformat', - '%d %b %Y') - self._slugify_name(kwargs.get('name', self.model)) - self.jquery_on_ready = kwargs.get('jquery_on_ready', False) - self.color_category = kwargs.get('color_category', None) - self.color_list = kwargs.get('color_list', None) - self.margin_bottom = kwargs.get('margin_bottom', 20) - self.margin_left = kwargs.get('margin_left', 60) - self.margin_right = kwargs.get('margin_right', 60) - self.margin_top = kwargs.get('margin_top', 30) - self.height = kwargs.get('height', '') - self.width = kwargs.get('width', '') - self.stacked = kwargs.get('stacked', False) - self.focus_enable = kwargs.get('focus_enable', False) - self.resize = kwargs.get('resize', False) - self.show_legend = kwargs.get('show_legend', True) - self.show_labels = kwargs.get('show_labels', True) - self.tag_script_js = kwargs.get('tag_script_js', True) - self.use_interactive_guideline = kwargs.get("use_interactive_guideline", - False) - self.chart_attr = kwargs.get("chart_attr", {}) - self.extras = kwargs.get('extras', None) - self.style = kwargs.get('style', '') - self.date_format = kwargs.get('date_format', '%x') - self.x_axis_date = kwargs.get('x_axis_date', False) - #: x-axis contain date format or not - # possible duplicate of x_axis_date - self.date_flag = kwargs.get('date_flag', False) - self.x_axis_format = kwargs.get('x_axis_format', '') - # Load remote JS assets or use the local bower assets? - self.remote_js_assets = kwargs.get('remote_js_assets', True) - - # None keywords attribute that should be modified by methods - # We should change all these to _attr - - self.htmlcontent = '' #: written by buildhtml - self.htmlheader = '' - #: Place holder for the graph (the HTML div) - #: Written by ``buildcontainer`` - self.container = u'' - #: Header for javascript code - self.containerheader = u'' - # CDN http://cdnjs.com/libraries/nvd3/ needs to make sure it's up to - # date - self.header_css = [ - '' % h for h in - ( - 'https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css' if self.remote_js_assets else self.assets_directory + 'nvd3/src/nv.d3.css', - ) - ] - - self.header_js = [ - '' % h for h in - ( - 'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js' if self.remote_js_assets else self.assets_directory + 'd3/d3.min.js', - 'https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.js' if self.remote_js_assets else self.assets_directory + 'nvd3/nv.d3.min.js' - ) - ] - - #: Javascript code as string - self.jschart = None - self.custom_tooltip_flag = False - self.tooltip_condition_string = '' - self.charttooltip = '' - self.serie_no = 1 - - def _slugify_name(self, name): - """Slufigy name with underscore""" - self.name = slugify(name, separator='_') - - def add_serie(self, y, x, name=None, extra=None, **kwargs): - """ - add serie - Series are list of data that will be plotted - y {1, 2, 3, 4, 5} / x {1, 2, 3, 4, 5} - - **Attributes**: - - * ``name`` - set Serie name - * ``x`` - x-axis data - * ``y`` - y-axis data - - kwargs: - - * ``shape`` - for scatterChart, you can set different shapes - (circle, triangle etc...) 
- * ``size`` - for scatterChart, you can set size of different shapes - * ``type`` - for multiChart, type should be bar - * ``bar`` - to display bars in Chart - * ``color_list`` - define list of colors which will be - used by pieChart - * ``color`` - set axis color - * ``disabled`` - - - extra: - - * ``tooltip`` - set tooltip flag - * ``date_format`` - set date_format for tooltip if x-axis is in - date format - - """ - if not name: - name = "Serie %d" % (self.serie_no) - - # For scatterChart shape & size fields are added in serie - if 'shape' in kwargs or 'size' in kwargs: - csize = kwargs.get('size', 1) - cshape = kwargs.get('shape', 'circle') - - serie = [{ - 'x': x[i], - 'y': j, - 'shape': cshape, - 'size': csize[i] if isinstance(csize, list) else csize - } for i, j in enumerate(y)] - else: - if self.model == 'pieChart': - serie = [{'label': x[i], 'value': y} for i, y in enumerate(y)] - else: - serie = [{'x': x[i], 'y': y} for i, y in enumerate(y)] - - data_keyvalue = {'values': serie, 'key': name} - - # multiChart - # Histogram type='bar' for the series - if 'type' in kwargs and kwargs['type']: - data_keyvalue['type'] = kwargs['type'] - - # Define on which Y axis the serie is related - # a chart can have 2 Y axis, left and right, by default only one Y Axis is used - if 'yaxis' in kwargs and kwargs['yaxis']: - data_keyvalue['yAxis'] = kwargs['yaxis'] - else: - if self.model != 'pieChart': - data_keyvalue['yAxis'] = '1' - - if 'bar' in kwargs and kwargs['bar']: - data_keyvalue['bar'] = 'true' - - if 'disabled' in kwargs and kwargs['disabled']: - data_keyvalue['disabled'] = 'true' - - if 'color' in kwargs and kwargs['color']: - data_keyvalue['color'] = kwargs['color'] - - if extra: - if self.model == 'pieChart': - if 'color_list' in extra and extra['color_list']: - self.color_list = extra['color_list'] - - if extra.get('date_format'): - self.charttooltip_dateformat = extra['date_format'] - - if extra.get('tooltip'): - self.custom_tooltip_flag = True - - if self.model != 'pieChart': - _start = extra['tooltip']['y_start'] - _end = extra['tooltip']['y_end'] - _start = ("'" + str(_start) + "' + ") if _start else '' - _end = (" + '" + str(_end) + "'") if _end else '' - - if self.model == 'linePlusBarChart': - if self.tooltip_condition_string: - self.tooltip_condition_string += stab(5) - self.tooltip_condition_string += stab(0) + "if(key.indexOf('" + name + "') > -1 ){\n" +\ - stab(6) + "var y = " + _start + " String(graph.point.y) " + _end + ";\n" +\ - stab(5) + "}\n" - elif self.model == 'cumulativeLineChart': - self.tooltip_condition_string += stab(0) + "if(key == '" + name + "'){\n" +\ - stab(6) + "var y = " + _start + " String(e) " + _end + ";\n" +\ - stab(5) + "}\n" - else: - self.tooltip_condition_string += stab(5) + "if(key == '" + name + "'){\n" +\ - stab(6) + "var y = " + _start + " String(graph.point.y) " + _end + ";\n" +\ - stab(5) + "}\n" - - if self.model == 'pieChart': - _start = extra['tooltip']['y_start'] - _end = extra['tooltip']['y_end'] - _start = ("'" + str(_start) + "' + ") if _start else '' - _end = (" + '" + str(_end) + "'") if _end else '' - self.tooltip_condition_string += "var y = " + _start + " String(y) " + _end + ";\n" - - # Increment series counter & append - self.serie_no += 1 - self.series.append(data_keyvalue) - - def add_chart_extras(self, extras): - """ - Use this method to add extra d3 properties to your chart. 
- For example, you want to change the text color of the graph:: - - chart = pieChart(name='pieChart', color_category='category20c', height=400, width=400) - - xdata = ["Orange", "Banana", "Pear", "Kiwi", "Apple", "Strawberry", "Pineapple"] - ydata = [3, 4, 0, 1, 5, 7, 3] - - extra_serie = {"tooltip": {"y_start": "", "y_end": " cal"}} - chart.add_serie(y=ydata, x=xdata, extra=extra_serie) - - The above code will create graph with a black text, the following will change it:: - - text_white="d3.selectAll('#pieChart text').style('fill', 'white');" - chart.add_chart_extras(text_white) - - The above extras will be appended to the java script generated. - - Alternatively, you can use the following initialization:: - - chart = pieChart(name='pieChart', - color_category='category20c', - height=400, width=400, - extras=text_white) - """ - self.extras = extras - - def set_graph_height(self, height): - """Set Graph height""" - self.height = str(height) - - def set_graph_width(self, width): - """Set Graph width""" - self.width = str(width) - - def set_containerheader(self, containerheader): - """Set containerheader""" - self.containerheader = containerheader - - def set_date_flag(self, date_flag=False): - """Set date flag""" - self.date_flag = date_flag - - def set_custom_tooltip_flag(self, custom_tooltip_flag): - """Set custom_tooltip_flag & date_flag""" - self.custom_tooltip_flag = custom_tooltip_flag - - def __str__(self): - """return htmlcontent""" - self.buildhtml() - return self.htmlcontent - - def buildcontent(self): - """Build HTML content only, no header or body tags. To be useful this - will usually require the attribute `juqery_on_ready` to be set which - will wrap the js in $(function(){};) - """ - self.buildcontainer() - # if the subclass has a method buildjs this method will be - # called instead of the method defined here - # when this subclass method is entered it does call - # the method buildjschart defined here - self.buildjschart() - self.htmlcontent = self.template_content_nvd3.render(chart=self) - - def buildhtml(self): - """Build the HTML page - Create the htmlheader with css / js - Create html page - Add Js code for nvd3 - """ - self.buildcontent() - self.content = self.htmlcontent - self.htmlcontent = self.template_page_nvd3.render(chart=self) - - # this is used by django-nvd3 - def buildhtmlheader(self): - """generate HTML header content""" - self.htmlheader = '' - # If the JavaScript assets have already been injected, don't bother re-sourcing them. - global _js_initialized - if '_js_initialized' not in globals() or not _js_initialized: - for css in self.header_css: - self.htmlheader += css - for js in self.header_js: - self.htmlheader += js - - def buildcontainer(self): - """generate HTML div""" - if self.container: - return - - # Create SVG div with style - if self.width: - if self.width[-1] != '%': - self.style += 'width:%spx;' % self.width - else: - self.style += 'width:%s;' % self.width - if self.height: - if self.height[-1] != '%': - self.style += 'height:%spx;' % self.height - else: - self.style += 'height:%s;' % self.height - if self.style: - self.style = 'style="%s"' % self.style - - self.container = self.containerheader + \ - '
\n' % (self.name, self.style) - - def buildjschart(self): - """generate javascript code for the chart""" - self.jschart = '' - - # add custom tooltip string in jschart - # default condition (if build_custom_tooltip is not called explicitly with date_flag=True) - if self.tooltip_condition_string == '': - self.tooltip_condition_string = 'var y = String(graph.point.y);\n' - - # Include data - self.series_js = json.dumps(self.series) - - def create_x_axis(self, name, label=None, format=None, date=False, custom_format=False): - """Create X-axis""" - axis = {} - if custom_format and format: - axis['tickFormat'] = format - elif format: - if format == 'AM_PM': - axis['tickFormat'] = "function(d) { return get_am_pm(parseInt(d)); }" - else: - axis['tickFormat'] = "d3.format(',%s')" % format - - if label: - axis['axisLabel'] = "'" + label + "'" - - # date format : see https://github.com/mbostock/d3/wiki/Time-Formatting - if date: - self.dateformat = format - axis['tickFormat'] = ("function(d) { return d3.time.format('%s')" - "(new Date(parseInt(d))) }\n" - "" % self.dateformat) - # flag is the x Axis is a date - if name[0] == 'x': - self.x_axis_date = True - - # Add new axis to list of axis - self.axislist[name] = axis - - # Create x2Axis if focus_enable - if name == "xAxis" and self.focus_enable: - self.axislist['x2Axis'] = axis - - def create_y_axis(self, name, label=None, format=None, custom_format=False): - """ - Create Y-axis - """ - axis = {} - - if custom_format and format: - axis['tickFormat'] = format - elif format: - axis['tickFormat'] = "d3.format(',%s')" % format - - if label: - axis['axisLabel'] = "'" + label + "'" - - # Add new axis to list of axis - self.axislist[name] = axis - - -class TemplateMixin(object): - """ - A mixin that override buildcontent. Instead of building the complex - content template we exploit Jinja2 inheritance. Thus each chart class - renders it's own chart template which inherits from content.html - """ - def buildcontent(self): - """Build HTML content only, no header or body tags. To be useful this - will usually require the attribute `juqery_on_ready` to be set which - will wrap the js in $(function(){};) - """ - self.buildcontainer() - # if the subclass has a method buildjs this method will be - # called instead of the method defined here - # when this subclass method is entered it does call - # the method buildjschart defined here - self.buildjschart() - self.htmlcontent = self.template_chart_nvd3.render(chart=self) - - -def _main(): - """ - Parse options and process commands - """ - # Parse arguments - usage = "usage: nvd3.py [options]" - parser = OptionParser(usage=usage, - version=("python-nvd3 - Charts generator with " - "nvd3.js and d3.js")) - parser.add_option("-q", "--quiet", - action="store_false", dest="verbose", default=True, - help="don't print messages to stdout") - - (options, args) = parser.parse_args() - - -if __name__ == '__main__': - _main() diff --git a/airflow/_vendor/nvd3/__init__.py b/airflow/_vendor/nvd3/__init__.py deleted file mode 100755 index 5b737b45361ad..0000000000000 --- a/airflow/_vendor/nvd3/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. 
- -Project location : https://github.com/areski/python-nvd3 -""" - -__version__ = '0.15.0' -__all__ = ['lineChart', 'pieChart', 'lineWithFocusChart', - 'stackedAreaChart', 'multiBarHorizontalChart', - 'linePlusBarChart', 'cumulativeLineChart', - 'scatterChart', 'discreteBarChart', 'multiBarChart'] - - -from .lineChart import lineChart -from .pieChart import pieChart -from .lineWithFocusChart import lineWithFocusChart -from .stackedAreaChart import stackedAreaChart -from .multiBarHorizontalChart import multiBarHorizontalChart -from .linePlusBarChart import linePlusBarChart -from .cumulativeLineChart import cumulativeLineChart -from .scatterChart import scatterChart -from .discreteBarChart import discreteBarChart -from .multiBarChart import multiBarChart -from . import ipynb diff --git a/airflow/_vendor/nvd3/cumulativeLineChart.py b/airflow/_vendor/nvd3/cumulativeLineChart.py deleted file mode 100644 index d98d0867e4d99..0000000000000 --- a/airflow/_vendor/nvd3/cumulativeLineChart.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class cumulativeLineChart(TemplateMixin, NVD3Chart): - """ - A cumulative line chart is used when you have one important grouping representing - an ordered set of data and one value to show, summed over time. - - Python example:: - - from nvd3 import cumulativeLineChart - chart = cumulativeLineChart(name='cumulativeLineChart', x_is_date=True) - xdata = [1365026400000000, 1365026500000000, 1365026600000000] - ydata = [6, 5, 1] - y2data = [36, 55, 11] - - extra_serie = {"tooltip": {"y_start": "There are ", "y_end": " calls"}} - chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie) - - extra_serie = {"tooltip": {"y_start": "", "y_end": " mins"}} - chart.add_serie(name="Serie 2", y=y2data, x=xdata, extra=extra_serie) - chart.buildhtml() - - Javascript generated: - - .. raw:: html - -
- - - """ - - CHART_FILENAME = "./cumulativelinechart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(cumulativeLineChart, self).__init__(**kwargs) - self.model = 'cumulativeLineChart' - - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - - if kwargs.get('x_is_date', False): - self.set_date_flag(True) - self.create_x_axis('xAxis', - format=kwargs.get('x_axis_format', '%d %b %Y'), - date=True) - self.set_custom_tooltip_flag(True) - else: - self.create_x_axis('xAxis', format=kwargs.get( - 'x_axis_format', '.2f')) - - self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.1%')) - - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/discreteBarChart.py b/airflow/_vendor/nvd3/discreteBarChart.py deleted file mode 100644 index cf6c8a4a8ff4b..0000000000000 --- a/airflow/_vendor/nvd3/discreteBarChart.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class discreteBarChart(TemplateMixin, NVD3Chart): - """ - A discrete bar chart or bar graph is a chart with rectangular bars with - lengths proportional to the values that they represent. - - Python example:: - - from nvd3 import discreteBarChart - chart = discreteBarChart(name='discreteBarChart', height=400, width=400) - - xdata = ["A", "B", "C", "D", "E", "F"] - ydata = [3, 4, 0, -3, 5, 7] - - chart.add_serie(y=ydata, x=xdata) - chart.buildhtml() - - Javascript generated: - - .. raw:: html - -
- - - - """ - CHART_FILENAME = "./discretebarchart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(discreteBarChart, self).__init__(**kwargs) - self.model = 'discreteBarChart' - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - - if kwargs.get('x_is_date', False): - self.set_date_flag(True) - self.create_x_axis('xAxis', - format=kwargs.get('x_axis_format', - "%d %b %Y %H %S"), - date=True) - else: - self.create_x_axis('xAxis', format=None) - - self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', ".0f")) - - self.set_custom_tooltip_flag(True) - - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/ipynb.py b/airflow/_vendor/nvd3/ipynb.py deleted file mode 100644 index f421afc0a8a50..0000000000000 --- a/airflow/_vendor/nvd3/ipynb.py +++ /dev/null @@ -1,91 +0,0 @@ -''' -ipython compatability module for nvd3-python -This adds simple ipython compatibility to the nvd3-python package, without making any -major modifications to how the main package is structured. It utilizes the IPython -display-formatter functionality, as described at: -http://nbviewer.ipython.org/github/ipython/ipython/blob/master/examples/notebooks/Custom%20Display%20Logic.ipynb -For additional examples, see: -https://github.com/sympy/sympy/blob/master/sympy/interactive/printing.py -''' - -try: - _ip = get_ipython() -except: - _ip = None -if _ip and _ip.__module__.lower().startswith('ipy'): - global _js_initialized - _js_initialized = False - - def _print_html(chart): - '''Function to return the HTML code for the div container plus the javascript - to generate the chart. This function is bound to the ipython formatter so that - charts are displayed inline.''' - global _js_initialized - if not _js_initialized: - print('js not initialized - pausing to allow time for it to load...') - initialize_javascript() - import time - time.sleep(5) - chart.buildhtml() - return chart.htmlcontent - - def _setup_ipython_formatter(ip): - ''' Set up the ipython formatter to display HTML formatted output inline''' - from IPython import __version__ as IPython_version - from nvd3 import __all__ as nvd3_all - - if IPython_version >= '0.11': - html_formatter = ip.display_formatter.formatters['text/html'] - for chart_type in nvd3_all: - html_formatter.for_type_by_name('nvd3.' + chart_type, chart_type, _print_html) - - def initialize_javascript(d3_js_url='https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js', - nvd3_js_url='https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.js', - nvd3_css_url='https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css', - use_remote=False): - '''Initialize the ipython notebook to be able to display nvd3 results. - by instructing IPython to load the nvd3 JS and css files, and the d3 JS file. - - by default, it looks for the files in your IPython Notebook working directory. 
- - Takes the following options: - - use_remote: use remote hosts for d3.js, nvd3.js, and nv.d3.css (default False) - * Note: the following options are ignored if use_remote is False: - nvd3_css_url: location of nvd3 css file (default https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css) - nvd3_js_url: location of nvd3 javascript file (default https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css) - d3_js_url: location of d3 javascript file (default https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js) - ''' - from IPython.display import display, Javascript, HTML - - if not use_remote: - # these file locations are for IPython 1.x, and will probably change when 2.x is released - d3_js_url = 'files/d3.v3.js' - nvd3_js_url = 'files/nv.d3.js' - nvd3_css_url = 'files/nv.d3.css' - - # load the required javascript files - - #display(Javascript('''$.getScript("%s")''' %(d3_js_url))) - display(HTML('''''' % (nvd3_css_url))) - # The following two methods for loading the script file are redundant. - # This is intentional. - # Ipython's loading of javscript in version 1.x is a bit squirrely, especially - # when creating demos to view in nbviewer. - # by trying twice, in two different ways (one using jquery and one using plain old - # HTML), we maximize our chances of successfully loading the script. - display(Javascript('''$.getScript("%s")''' % (nvd3_js_url))) - display(Javascript('''$.getScript("%s", function() { - $.getScript("%s", function() {})});''' % (d3_js_url, nvd3_js_url))) - display(HTML('' % (d3_js_url))) - display(HTML('' % (nvd3_js_url))) - - global _js_initialized - _js_initialized = True - - print('loaded nvd3 IPython extension\n' - 'run nvd3.ipynb.initialize_javascript() to set up the notebook\n' - 'help(nvd3.ipynb.initialize_javascript) for options') - - _setup_ipython_formatter(_ip) diff --git a/airflow/_vendor/nvd3/lineChart.py b/airflow/_vendor/nvd3/lineChart.py deleted file mode 100644 index c237d069802ad..0000000000000 --- a/airflow/_vendor/nvd3/lineChart.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class lineChart(TemplateMixin, NVD3Chart): - - """ - A line chart or line graph is a type of chart which displays information - as a series of data points connected by straight line segments. - - Python example:: - - from nvd3 import lineChart - chart = lineChart(name="lineChart", x_is_date=False, x_axis_format="AM_PM") - - xdata = range(24) - ydata = [0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 4, 3, 3, 5, 7, 5, 3, 16, 6, 9, 15, 4, 12] - ydata2 = [9, 8, 11, 8, 3, 7, 10, 8, 6, 6, 9, 6, 5, 4, 3, 10, 0, 6, 3, 1, 0, 0, 0, 1] - - extra_serie = {"tooltip": {"y_start": "There are ", "y_end": " calls"}} - chart.add_serie(y=ydata, x=xdata, name='sine', extra=extra_serie, **kwargs1) - extra_serie = {"tooltip": {"y_start": "", "y_end": " min"}} - chart.add_serie(y=ydata2, x=xdata, name='cose', extra=extra_serie, **kwargs2) - chart.buildhtml() - - Javascript renderd to: - - .. raw:: html - -
- - - See the source code of this page, to see the underlying javascript. - """ - CHART_FILENAME = "./linechart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(lineChart, self).__init__(**kwargs) - self.model = 'lineChart' - - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - - if kwargs.get('x_is_date', False): - self.set_date_flag(True) - self.create_x_axis('xAxis', - format=kwargs.get('x_axis_format', '%d %b %Y'), - date=True) - self.set_custom_tooltip_flag(True) - else: - if kwargs.get('x_axis_format') == 'AM_PM': - self.x_axis_format = format = 'AM_PM' - else: - format = kwargs.get('x_axis_format', 'r') - self.create_x_axis('xAxis', format=format, - custom_format=kwargs.get('x_custom_format', - False)) - self.create_y_axis( - 'yAxis', - format=kwargs.get('y_axis_format', '.02f'), - custom_format=kwargs.get('y_custom_format', False)) - - # must have a specified height, otherwise it superimposes both chars - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/linePlusBarChart.py b/airflow/_vendor/nvd3/linePlusBarChart.py deleted file mode 100644 index 4eaa5fc6ffdbf..0000000000000 --- a/airflow/_vendor/nvd3/linePlusBarChart.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class linePlusBarChart(TemplateMixin, NVD3Chart): - - """ - A linePlusBarChart Chart is a type of chart which displays information - as a series of data points connected by straight line segments - and with some series with rectangular bars with lengths proportional - to the values that they represent. - - Python example:: - - from nvd3 import linePlusBarChart - chart = linePlusBarChart(name="linePlusBarChart", - width=500, height=400, x_axis_format="%d %b %Y", - x_is_date=True, focus_enable=True, - yaxis2_format="function(d) { return d3.format(',0.3f')(d) }") - - xdata = [1338501600000, 1345501600000, 1353501600000] - ydata = [6, 5, 1] - y2data = [0.002, 0.003, 0.004] - - extra_serie = {"tooltip": {"y_start": "There are ", "y_end": " calls"}, - "date_format": "%d %b %Y %H:%S" } - chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie, - bar=True) - - extra_serie = {"tooltip": {"y_start": "There are ", "y_end": " min"}} - chart.add_serie(name="Serie 2", y=y2data, x=xdata, extra=extra_serie) - chart.buildcontent() - - Note that in case you have two data serie with extreme different numbers, - that you would like to format in different ways, - you can pass a keyword *yaxis1_format* or *yaxis2_format* when - creating the graph. - - In the example above the graph created presents the values of the second - data series with three digits right of the decimal point. - - Javascript generated: - - .. raw:: html - -
- - - """ - CHART_FILENAME = "./lineplusbarchart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(linePlusBarChart, self).__init__(**kwargs) - self.model = 'linePlusBarChart' - - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - self.yaxis1_format = kwargs.get('yaxis1_format', - "function(d) { return d3.format(',f')(d) }") - self.yaxis2_format = kwargs.get('yaxis2_format', - "function(d) { return d3.format(',f')(d) }") - - if kwargs.get('x_is_date', False): - self.set_date_flag(True) - self.create_x_axis('xAxis', - format=kwargs.get('x_axis_format', - '%d %b %Y %H %S'), - date=True) - self.create_x_axis('x2Axis', format=kwargs.get('x_axis_format', - '%d %b %Y %H %S'), - date=True) - self.set_custom_tooltip_flag(True) - else: - self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', - '.2f')) - self.create_x_axis('x2Axis', format=kwargs.get('x_axis_format', - '.2f')) - - self.create_y_axis('y1Axis', format=self.yaxis1_format, - custom_format=True) - self.create_y_axis('y2Axis', format=self.yaxis2_format, - custom_format=True) - - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/lineWithFocusChart.py b/airflow/_vendor/nvd3/lineWithFocusChart.py deleted file mode 100644 index cd26cd4716652..0000000000000 --- a/airflow/_vendor/nvd3/lineWithFocusChart.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class lineWithFocusChart(TemplateMixin, NVD3Chart): - """ - A lineWithFocusChart or line graph is a type of chart which displays information - as a series of data points connected by straight line segments. - The lineWithFocusChart provide a smaller chart that act as a selector, - this is very useful if you want to zoom on a specific time period. - - Python example:: - - from nvd3 import lineWithFocusChart - chart = lineWithFocusChart(name='lineWithFocusChart', x_is_date=True, x_axis_format="%d %b %Y") - xdata = [1365026400000000, 1365026500000000, 1365026600000000, 1365026700000000, 1365026800000000, 1365026900000000, 1365027000000000] - ydata = [-6, 5, -1, 2, 4, 8, 10] - - extra_serie = {"tooltip": {"y_start": "", "y_end": " ext"}, - "date_format": "%d %b %Y"} - chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie) - chart.buildhtml() - - Javascript generated: - - .. raw:: html - -
- - - """ - - CHART_FILENAME = "./linewfocuschart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(lineWithFocusChart, self).__init__(**kwargs) - self.model = 'lineWithFocusChart' - - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - - if kwargs.get('x_is_date', False): - self.set_date_flag(True) - self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', - '%d %b %Y %H %S'), - date=True) - self.create_x_axis('x2Axis', format=kwargs.get('x_axis_format', - '%d %b %Y %H %S'), - date=True) - self.set_custom_tooltip_flag(True) - else: - self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', - '.2f')) - self.create_x_axis('x2Axis', format=kwargs.get('x_axis_format', - '.2f')) - - self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.2f')) - self.create_y_axis('y2Axis', format=kwargs.get('y_axis_format', '.2f')) - - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/multiBarChart.py b/airflow/_vendor/nvd3/multiBarChart.py deleted file mode 100644 index cf335919a84c7..0000000000000 --- a/airflow/_vendor/nvd3/multiBarChart.py +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class multiBarChart(TemplateMixin, NVD3Chart): - """ - A multiple bar graph contains comparisons of two or more categories or bars. - One axis represents a quantity and the other axis identifies a specific feature - about the categories. Reading a multiple bar graph includes looking at extremes - (tallest/longest vs. shortest) in each grouping. - - Python example:: - - from nvd3 import multiBarChart - chart = multiBarChart(width=500, height=400, x_axis_format=None) - xdata = ['one', 'two', 'three', 'four'] - ydata1 = [6, 12, 9, 16] - ydata2 = [8, 14, 7, 11] - - chart.add_serie(name="Serie 1", y=ydata1, x=xdata) - chart.add_serie(name="Serie 2", y=ydata2, x=xdata) - chart.buildhtml() - - Javascript generated: - - .. raw:: html - -
- - - """ - - CHART_FILENAME = "./multibarchart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(multiBarChart, self).__init__(**kwargs) - - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - - if kwargs.get('x_is_date', False): - self.set_date_flag(True) - self.create_x_axis('xAxis', - format=kwargs.get('x_axis_format', '%d %b %Y'), - date=True) - self.set_custom_tooltip_flag(True) - else: - self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', '.2f')) - self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.2f')) - - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/multiBarHorizontalChart.py b/airflow/_vendor/nvd3/multiBarHorizontalChart.py deleted file mode 100644 index ac969c31b548c..0000000000000 --- a/airflow/_vendor/nvd3/multiBarHorizontalChart.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class multiBarHorizontalChart(TemplateMixin, NVD3Chart): - """ - A multiple horizontal bar graph contains comparisons of two or more categories or bars. - - Python example:: - - from nvd3 import multiBarHorizontalChart - chart = multiBarHorizontalChart(name='multiBarHorizontalChart', height=400, width=400) - xdata = [-14, -7, 7, 14] - ydata = [-6, 5, -1, 9] - y2data = [-23, -6, -32, 9] - - extra_serie = {"tooltip": {"y_start": "", "y_end": " balls"}} - chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie) - - extra_serie = {"tooltip": {"y_start": "", "y_end": " calls"}} - chart.add_serie(name="Serie 2", y=y2data, x=xdata, extra=extra_serie) - chart.buildcontent() - - Javascript generated: - - .. raw:: html - -
- - - """ - - CHART_FILENAME = "./multibarcharthorizontal.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(multiBarHorizontalChart, self).__init__(**kwargs) - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - - self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', '.2f')) - self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.2f')) - - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/pieChart.py b/airflow/_vendor/nvd3/pieChart.py deleted file mode 100644 index 1db76bdb3424c..0000000000000 --- a/airflow/_vendor/nvd3/pieChart.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class pieChart(TemplateMixin, NVD3Chart): - - """ - A pie chart (or a circle graph) is a circular chart divided into sectors, - illustrating numerical proportion. In chart, the arc length of each sector - is proportional to the quantity it represents. - - Python example:: - - from nvd3 import pieChart - chart = pieChart(name='pieChart', color_category='category20c', - height=400, width=400) - - xdata = ["Orange", "Banana", "Pear", "Kiwi", "Apple", "Strawbery", - "Pineapple"] - ydata = [3, 4, 0, 1, 5, 7, 3] - - extra_serie = {"tooltip": {"y_start": "", "y_end": " cal"}} - chart.add_serie(y=ydata, x=xdata, extra=extra_serie) - chart.buildhtml() - - Javascript generated: - - .. raw:: html - -
- - - """ - CHART_FILENAME = "./piechart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(pieChart, self).__init__(**kwargs) - - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - self.donut = kwargs.get('donut', False) - self.donutRatio = kwargs.get('donutRatio', 0.35) - self.color_list = [] - self.create_x_axis('xAxis', format=None) - self.create_y_axis('yAxis', format=None) - # must have a specified height, otherwise it superimposes both chars - if height: - self.set_graph_height(height) - if width: - self.set_graph_width(width) - self.donut = kwargs.get('donut', False) - self.donutRatio = kwargs.get('donutRatio', 0.35) diff --git a/airflow/_vendor/nvd3/scatterChart.py b/airflow/_vendor/nvd3/scatterChart.py deleted file mode 100644 index c3a87d2908bde..0000000000000 --- a/airflow/_vendor/nvd3/scatterChart.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class scatterChart(TemplateMixin, NVD3Chart): - - """ - A scatter plot or scattergraph is a type of mathematical diagram using Cartesian - coordinates to display values for two variables for a set of data. - The data is displayed as a collection of points, each having the value of one variable - determining the position on the horizontal axis and the value of the other variable - determining the position on the vertical axis. - - Python example:: - - from nvd3 import scatterChart - chart = scatterChart(name='scatterChart', height=400, width=400) - xdata = [3, 4, 0, -3, 5, 7] - ydata = [-1, 2, 3, 3, 15, 2] - ydata2 = [1, -2, 4, 7, -5, 3] - - kwargs1 = {'shape': 'circle', 'size': '1'} - kwargs2 = {'shape': 'cross', 'size': '10'} - - extra_serie = {"tooltip": {"y_start": "", "y_end": " call"}} - chart.add_serie(name="series 1", y=ydata, x=xdata, extra=extra_serie, **kwargs1) - - extra_serie = {"tooltip": {"y_start": "", "y_end": " min"}} - chart.add_serie(name="series 2", y=ydata2, x=xdata, extra=extra_serie, **kwargs2) - chart.buildhtml() - - Javascript generated: - - .. raw:: html - -
- - - """ - - CHART_FILENAME = "./scatterchart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(scatterChart, self).__init__(**kwargs) - self.model = 'scatterChart' - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', '.02f'), - label=kwargs.get('x_axis_label', None)) - self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.02f'), - label=kwargs.get('y_axis_label', None)) - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/stackedAreaChart.py b/airflow/_vendor/nvd3/stackedAreaChart.py deleted file mode 100644 index 8346cd2c53879..0000000000000 --- a/airflow/_vendor/nvd3/stackedAreaChart.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -Python-nvd3 is a Python wrapper for NVD3 graph library. -NVD3 is an attempt to build re-usable charts and chart components -for d3.js without taking away the power that d3.js gives you. - -Project location : https://github.com/areski/python-nvd3 -""" - -from .NVD3Chart import NVD3Chart, TemplateMixin - - -class stackedAreaChart(TemplateMixin, NVD3Chart): - """ - The stacked area chart is identical to the area chart, except the areas are stacked - on top of each other, rather than overlapping. This can make the chart much easier to read. - - Python example:: - - from nvd3 import stackedAreaChart - chart = stackedAreaChart(name='stackedAreaChart', height=400, width=400) - - xdata = [100, 101, 102, 103, 104, 105, 106,] - ydata = [6, 11, 12, 7, 11, 10, 11] - ydata2 = [8, 20, 16, 12, 20, 28, 28] - - extra_serie = {"tooltip": {"y_start": "There is ", "y_end": " min"}} - chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie) - chart.add_serie(name="Serie 2", y=ydata2, x=xdata, extra=extra_serie) - chart.buildhtml() - - Javascript generated: - - .. raw:: html - -
- - - """ - - CHART_FILENAME = "./stackedareachart.html" - template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) - - def __init__(self, **kwargs): - super(stackedAreaChart, self).__init__(**kwargs) - height = kwargs.get('height', 450) - width = kwargs.get('width', None) - self.model = 'stackedAreaChart' - - if kwargs.get('x_is_date', False): - self.set_date_flag(True) - self.create_x_axis('xAxis', - format=kwargs.get('x_axis_format', '%d %b %Y'), - date=True) - self.set_custom_tooltip_flag(True) - else: - self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', - '.2f')) - self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.2f')) - - self.set_graph_height(height) - if width: - self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/templates/base.html b/airflow/_vendor/nvd3/templates/base.html deleted file mode 100644 index 997e6331625e4..0000000000000 --- a/airflow/_vendor/nvd3/templates/base.html +++ /dev/null @@ -1,35 +0,0 @@ -{% block container %} -{% endblock %} - -{% block start_script %} - {% if chart.tag_script_js %} - - {% endif %} -{% endblock endscript %} diff --git a/airflow/_vendor/nvd3/templates/content.html b/airflow/_vendor/nvd3/templates/content.html deleted file mode 100644 index 787f39b555a4a..0000000000000 --- a/airflow/_vendor/nvd3/templates/content.html +++ /dev/null @@ -1,123 +0,0 @@ -{% extends "base.html" %} -{% block container %} -{% if chart.display_container %} - {{ chart.container }} -{% endif %} -{% endblock container %} - -{% block body %} - {% block data %} - data_{{ chart.name }}={{ chart.series_js }}; - {% endblock data %} - - {% block init %} - nv.addGraph(function() { - var chart = nv.models.{{ chart.model }}(){% if chart.use_interactive_guideline %}.useInteractiveGuideline(true){% endif %}; - - chart.margin({top: {{ chart.margin_top }}, right: {{ chart.margin_right }}, bottom: {{ chart.margin_bottom }}, left: {{ chart.margin_left }}}); - - var datum = data_{{ chart.name }}; - - {% if not chart.color_list and chart.color_category %} - chart.color(d3.scale.{{ chart.color_category }}().range()); - {% endif %} - {% endblock init %} - - {% if chart.stacked %} - chart.stacked(true); - {% endif %} - - {% block focus %} - {% endblock focus %} - - - {% block axes %} - {% for axis, a in chart.axislist.items() %} - {% if a.items() %} - chart.{{ axis }} - {% for attr, value in a.items() %} - .{{ attr}}({{ value}}){% if loop.last %}; - {% endif %} - {% endfor %} - {% endif %} - {% endfor %} - {% endblock axes %} - - {# generate custom tooltip for the chart #} - {% block tooltip %} - {% if chart.custom_tooltip_flag %} - {% if not chart.date_flag %} - {% if chart.model == 'pieChart' %} - {% block pietooltip %} - {% endblock pietooltip %} - {% else %} - chart.tooltipContent(function(key, y, e, graph) { - var x = String(graph.point.x); - var y = String(graph.point.y); - {{ chart.tooltip_condition_string }} - tooltip_str = '
'+key+'
' + y + ' at ' + x; - return tooltip_str; - }); - {% endif %} - {% else %} - chart.tooltipContent(function(key, y, e, graph) { - var x = d3.time.format("{{ chart.charttooltip_dateformat }}")(new Date(parseInt(graph.point.x))); - var y = String(graph.point.y); - {{ chart.tooltip_condition_string }} - tooltip_str = '
'+key+'
' + y + ' on ' + x; - return tooltip_str; - }); - {% endif %} - {% endif %} - {% endblock tooltip %} - - {# the shape attribute in kwargs is not applied when #} - {# not allowing other shapes to be rendered #} - {% block legend %} - chart.showLegend({{chart.show_legend|lower}}); - {% endblock legend %} - - {% block custoattr %} - {# add custom chart attributes #} - {% for attr, value in chart.chart_attr.items() %} - {% if value is string and value.startswith(".") %}: - chart.{{ attr }}{{ value }}; - {% else %} - chart.{{ attr }}({{ value }}); - {% endif %} - {% endfor %} - - {% if chart.resize %} - nv.utils.windowResize(chart.update); - {% endif %} - - {# include specific subchart #} - {{ chart.jschart }} - - {% endblock custoattr %} - - {% block inject %} - {# Inject data to D3 #} - d3.select('#{{ chart.name }} svg') - .datum(datum) - .transition().duration(500) - {% if chart.width %} - .attr('width', {{ chart.width}}) - {% endif %} - {% if chart.height %} - .attr('height', {{ chart.height}}) - {% endif %} - .call(chart); - {% endblock inject %} - - {# extra chart attributes #} - {% if chart.extras %} - {{ chart.extras }} - {% endif %} - - {# closing nv.addGraph #} - {% block close %} - }); - {% endblock close %} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/cumulativelinechart.html b/airflow/_vendor/nvd3/templates/cumulativelinechart.html deleted file mode 100644 index 66b6c74d1e144..0000000000000 --- a/airflow/_vendor/nvd3/templates/cumulativelinechart.html +++ /dev/null @@ -1,10 +0,0 @@ -{# This template adds attributes unique - to cumulativeLineChart #} - -{% extends "content.html" %} -{% block body %} - -{# calling super guarantees everying in content is also found here ...#} -{{super()}} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/discretebarchart.html b/airflow/_vendor/nvd3/templates/discretebarchart.html deleted file mode 100644 index 3a52b3abd32b1..0000000000000 --- a/airflow/_vendor/nvd3/templates/discretebarchart.html +++ /dev/null @@ -1,31 +0,0 @@ -{# This is a dummy template, we can use that template to add attributes unique - to discreteBarChart #} - -{% extends "content.html" %} -{% block body %} - - {% block data %} - {{super()}} - {% endblock data %} - - {% block init %} - {{super()}} - {% endblock init %} - - {% block axes %} - {{super()}} - {% endblock axes %} - - {% block custoattr %} - {{super()}} - {% endblock custoattr %} - - {% block inject %} - {{ super() }} - {% endblock inject %} - - {% block close %} - {{ super() }} - {% endblock close %} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/linebarwfocuschart.html b/airflow/_vendor/nvd3/templates/linebarwfocuschart.html deleted file mode 100644 index ad4866c8153f9..0000000000000 --- a/airflow/_vendor/nvd3/templates/linebarwfocuschart.html +++ /dev/null @@ -1,60 +0,0 @@ -{# This template adds attributes unique - to lineChart #} - -{% extends "content.html" %} -{% block body %} - {% block data %} - data_{{ chart.name }}={{ chart.series_js }}; - {% endblock data %} - - - {% block init %} - {{super()}} - {% endblock init %} - {% block axes %} - {{super()}} - {% endblock axes %} - {% block tooltip %} - {{super()}} - {% endblock tooltip %} - - chart.showLegend({{chart.show_legend|lower}}); - - {# add custom chart attributes #} - {% for attr, value in chart.chart_attr.items() %} - {% if value is string and value.startswith(".") %}: - chart.{{ attr }}{{ value }}; - {% else %} - chart.{{ attr }}({{ value }}); - {% endif %} - {% endfor %} - - {% if 
chart.x_axis_format == 'AM_PM' %} - function get_am_pm(d){ - if (d > 12) { - d = d - 12; return (String(d) + 'PM'); - } - else { - return (String(d) + 'AM'); - } - }; - {% else %} - chart.x(function(d,i) { return i }); - {% endif %} - - {% if chart.resize %} - nv.utils.windowResize(chart.update); - {% endif %} - {% block inject %} - {{super()}} - {% endblock inject %} - - {% if chart.extras %} - {{ chart.extras }} - {% endif %} - - {% block close %} - }); - {% endblock close %} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/linechart.html b/airflow/_vendor/nvd3/templates/linechart.html deleted file mode 100644 index edd9633314296..0000000000000 --- a/airflow/_vendor/nvd3/templates/linechart.html +++ /dev/null @@ -1,46 +0,0 @@ -{# This template adds attributes unique - to lineChart #} - -{% extends "content.html" %} -{% block body %} - - {% block data %} - {{super()}} - {% endblock data %} - - {% block init %} - {{super()}} - {% endblock init %} - - {% block axes %} - {{super()}} - {% endblock axes %} - - {% if chart.x_axis_format == 'AM_PM' %} - function get_am_pm(d){ - if (d > 12) { - d = d - 12; return (String(d) + 'PM'); - } - else { - return (String(d) + 'AM'); - } - }; - {% endif %} - - {% block legend %} - {{super()}} - {% endblock legend %} - - {% block custoattr %} - {{super()}} - {% endblock custoattr %} - - {% block inject %} - {{ super() }} - {% endblock inject %} - - {% block close %} - {{ super() }} - {% endblock close %} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/lineplusbarchart.html b/airflow/_vendor/nvd3/templates/lineplusbarchart.html deleted file mode 100644 index 830a192e69afb..0000000000000 --- a/airflow/_vendor/nvd3/templates/lineplusbarchart.html +++ /dev/null @@ -1,43 +0,0 @@ -{# This template adds attributes unique - to linePlusBarChart #} - -{% extends "content.html" %} -{% block body %} - - {% block data %} - {{super()}} - {% endblock data %} - - {% block init %} - {{super()}} - {% endblock init %} - - {% block focus %} - {% if chart.focus_enable %} - chart.focusEnable(true); - {% else %} - chart.focusEnable(false); - {% endif %} - {% endblock focus %} - - {% block axes %} - {{super()}} - {% endblock axes %} - - {% block legend %} - {{super()}} - {% endblock legend %} - - {% block custoattr %} - {{super()}} - {% endblock custoattr %} - - {% block inject %} - {{ super() }} - {% endblock inject %} - - {% block close %} - {{ super() }} - {% endblock close %} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/linewfocuschart.html b/airflow/_vendor/nvd3/templates/linewfocuschart.html deleted file mode 100644 index ef0286767abb2..0000000000000 --- a/airflow/_vendor/nvd3/templates/linewfocuschart.html +++ /dev/null @@ -1,10 +0,0 @@ -{# This template adds attributes unique - to lineWithFocusChart #} - -{% extends "content.html" %} -{% block body %} - -{# calling super guarantees everying in content is also found here ...#} -{{super()}} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/multibarchart.html b/airflow/_vendor/nvd3/templates/multibarchart.html deleted file mode 100644 index 17eae7a634fef..0000000000000 --- a/airflow/_vendor/nvd3/templates/multibarchart.html +++ /dev/null @@ -1,10 +0,0 @@ -{# This template adds attributes unique - to multiBarChart #} - -{% extends "content.html" %} -{% block body %} - -{# calling super guarantees everying in content is also found here ...#} -{{super()}} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/multibarcharthorizontal.html 
b/airflow/_vendor/nvd3/templates/multibarcharthorizontal.html deleted file mode 100644 index 17eae7a634fef..0000000000000 --- a/airflow/_vendor/nvd3/templates/multibarcharthorizontal.html +++ /dev/null @@ -1,10 +0,0 @@ -{# This template adds attributes unique - to multiBarChart #} - -{% extends "content.html" %} -{% block body %} - -{# calling super guarantees everying in content is also found here ...#} -{{super()}} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/page.html b/airflow/_vendor/nvd3/templates/page.html deleted file mode 100644 index 2dd0f5d16f829..0000000000000 --- a/airflow/_vendor/nvd3/templates/page.html +++ /dev/null @@ -1,12 +0,0 @@ - - - - - {% for header_element in chart.header_css+chart.header_js %} - {{ header_element }} - {% endfor %} - - - {{ chart.content }} - - diff --git a/airflow/_vendor/nvd3/templates/piechart.html b/airflow/_vendor/nvd3/templates/piechart.html deleted file mode 100644 index a200e6d4a21bb..0000000000000 --- a/airflow/_vendor/nvd3/templates/piechart.html +++ /dev/null @@ -1,80 +0,0 @@ -{# This template adds attributes unique - to pieChart #} - -{% extends "content.html" %} -{% block body %} - - data_{{ chart.name }}={{ chart.series_js }}; - - nv.addGraph(function() { - var chart = nv.models.{{ chart.model }}(){% if chart.use_interactive_guideline %}.useInteractiveGuideline(true){% endif %}; - chart.margin({top: {{ chart.margin_top }}, right: {{ chart.margin_right }}, bottom: {{ chart.margin_bottom }}, left: {{ chart.margin_left }}}); - var datum = data_{{ chart.name }}[0].values; - - {% if not chart.color_list and chart.color_category %} - chart.color(d3.scale.{{ chart.color_category }}().range()); - {% endif %} - - chart.tooltipContent(function(key, y, e, graph) { - var x = String(key); - {{ chart.tooltip_condition_string }} - tooltip_str = '
<center><b>'+x+'</b></center>
' + y; - return tooltip_str; - }); - {# showLabels only supported in pieChart #} - chart.showLabels({{chart.show_labels|lower}}); - - {% if chart.donut %} - chart.donut(true); - chart.donutRatio({{ chart.donutRatio }}); - {% else %} - chart.donut(false); - {% endif %} - - chart.showLegend({{chart.show_legend|lower}}); - - {# add custom chart attributes #} - {% for attr, value in chart.chart_attr.items() %} - {% if value is string and value.startswith(".") %}: - chart.{{ attr }}{{ value }}; - {% else %} - chart.{{ attr }}({{ value }}); - {% endif %} - {% endfor %} - - {% if chart.resize %} - nv.utils.windowResize(chart.update); - {% endif %} - - {% if chart.color_list %} - var mycolor = new Array(); - {% for color in chart.color_list %} - mycolor[{{ loop.index - 1}}] = "{{ color }}"; - {% endfor %} - {% endif %} - - chart - .x(function(d) { return d.label }) - .y(function(d) { return d.value }); - - {% if chart.width %} - chart.width({{ chart.width }}); - {% endif %} - - {% if chart.height %} - chart.height({{ chart.height }}); - {% endif %} - - {% if chart.color_list %} - chart.color(mycolor); - {% endif %} - - {% block inject %} - {{super()}} - {% endblock inject %} - - {% block close %} - {{ super() }} - {% endblock close %} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/scatterchart.html b/airflow/_vendor/nvd3/templates/scatterchart.html deleted file mode 100644 index 8c2adaae34cee..0000000000000 --- a/airflow/_vendor/nvd3/templates/scatterchart.html +++ /dev/null @@ -1,52 +0,0 @@ -{# This template adds attributes unique - to scatterChart #} - -{% extends "content.html" %} -{% block body %} - - {% block data %} - {{super()}} - {% endblock data %} - - {% block init %} - {{super()}} - {% endblock init %} - - {% block axes %} - {{super()}} - {% endblock axes %} - - {% if chart.x_axis_format == 'AM_PM' %} - function get_am_pm(d){ - if (d > 12) { - d = d - 12; return (String(d) + 'PM'); - } - else { - return (String(d) + 'AM'); - } - }; - {% endif %} - - {% block legend %} - {{super()}} - {% endblock legend %} - - {% block custoattr %} - {{super()}} - {% endblock custoattr %} - - {% block inject %} - - chart - .showDistX(true) - .showDistY(true) - .color(d3.scale.category10().range()); - - {{ super() }} - {% endblock inject %} - - {% block close %} - {{ super() }} - {% endblock close %} - -{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/stackedareachart.html b/airflow/_vendor/nvd3/templates/stackedareachart.html deleted file mode 100644 index b70833d2b385d..0000000000000 --- a/airflow/_vendor/nvd3/templates/stackedareachart.html +++ /dev/null @@ -1,7 +0,0 @@ -{# This is a dummy template, we can use that template to add attributes unique - to stackedareachart #} - -{% extends "content.html" %} -{% block body %} - {{ super() }} -{% endblock body %} diff --git a/airflow/_vendor/nvd3/translator.py b/airflow/_vendor/nvd3/translator.py deleted file mode 100644 index ffde2c2a1cec9..0000000000000 --- a/airflow/_vendor/nvd3/translator.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - - -class Tag(object): - """Tag class""" - - def __init__(self, content=None): - self.content = content - self.attrs = ' '.join(['%s="%s"' % (attr, value) - for attr, value in self.attrs]) - - def __str__(self): - return '<%s%s>\n %s\n' % (self.name, - ' ' + self.attrs if self.attrs else '', - self.content, - self.name) - - -class ScriptTag(Tag): - name = 'script' - attrs = (('type', 'text/javascript'),) - - -class AnonymousFunction(object): - def 
__init__(self, arguments, content): - self.arguments = arguments - self.content = content - - def __str__(self): - return 'function(%s) { %s }' % (self.arguments, self.content) - - -class Function(object): - - def __init__(self, name): - self.name = name - self._calls = [] - - def __str__(self): - operations = [self.name] - operations.extend(str(call) for call in self._calls) - return '%s' % ('.'.join(operations),) - - def __getattr__(self, attr): - self._calls.append(attr) - return self - - def __call__(self, *args): - if not args: - self._calls[-1] = self._calls[-1] + '()' - else: - arguments = ','.join([str(arg) for arg in args]) - self._calls[-1] = self._calls[-1] + '(%s)' % (arguments,) - return self - - -class Assignment(object): - - def __init__(self, key, value, scoped=True): - self.key = key - self.value = value - self.scoped = scoped - - def __str__(self): - return '%s%s = %s;' % ('var ' if self.scoped else '', self.key, self.value) - - -def indent(func): - # TODO: Add indents to function str - return str(func) diff --git a/airflow/_vendor/slugify/LICENSE b/airflow/_vendor/slugify/LICENSE deleted file mode 100644 index 82af695f594e8..0000000000000 --- a/airflow/_vendor/slugify/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License - -Copyright (c) Val Neekman @ Neekware Inc. http://neekware.com - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/airflow/_vendor/slugify/__init__.py b/airflow/_vendor/slugify/__init__.py deleted file mode 100644 index 7358b998cd543..0000000000000 --- a/airflow/_vendor/slugify/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .slugify import * - - -__author__ = 'Val Neekman @ Neekware Inc. 
[@vneekman]' -__description__ = 'A Python slugify application that also handles Unicode' -__version__ = '2.0.1' diff --git a/airflow/_vendor/slugify/slugify.py b/airflow/_vendor/slugify/slugify.py deleted file mode 100644 index 0e9886d827138..0000000000000 --- a/airflow/_vendor/slugify/slugify.py +++ /dev/null @@ -1,185 +0,0 @@ -import re -import unicodedata -import types -import sys - -try: - from htmlentitydefs import name2codepoint - _unicode = unicode - _unicode_type = types.UnicodeType -except ImportError: - from html.entities import name2codepoint - _unicode = str - _unicode_type = str - unichr = chr - -import text_unidecode as unidecode - -__all__ = ['slugify', 'smart_truncate'] - - -CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint)) -DECIMAL_PATTERN = re.compile(r'&#(\d+);') -HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);') -QUOTE_PATTERN = re.compile(r'[\']+') -ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+') -ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+') -DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}') -NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)') -DEFAULT_SEPARATOR = '-' - - -def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False): - """ - Truncate a string. - :param string (str): string for modification - :param max_length (int): output string length - :param word_boundary (bool): - :param save_order (bool): if True then word order of output string is like input string - :param separator (str): separator between words - :return: - """ - - string = string.strip(separator) - - if not max_length: - return string - - if len(string) < max_length: - return string - - if not word_boundary: - return string[:max_length].strip(separator) - - if separator not in string: - return string[:max_length] - - truncated = '' - for word in string.split(separator): - if word: - next_len = len(truncated) + len(word) - if next_len < max_length: - truncated += '{0}{1}'.format(word, separator) - elif next_len == max_length: - truncated += '{0}'.format(word) - break - else: - if save_order: - break - if not truncated: # pragma: no cover - truncated = string[:max_length] - return truncated.strip(separator) - - -def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, - separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True, - replacements=()): - """ - Make a slug from the given text. - :param text (str): initial text - :param entities (bool): - :param decimal (bool): - :param hexadecimal (bool): - :param max_length (int): output string length - :param word_boundary (bool): - :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order - :param separator (str): separator between words - :param stopwords (iterable): words to discount - :param regex_pattern (str): regex pattern for allowed characters - :param lowercase (bool): activate case sensitivity by setting it to False - :param replacements (iterable): list of replacement rules e.g. 
[['|', 'or'], ['%', 'percent']] - :return (str): - """ - - # user-specific replacements - if replacements: - for old, new in replacements: - text = text.replace(old, new) - - # ensure text is unicode - if not isinstance(text, _unicode_type): - text = _unicode(text, 'utf-8', 'ignore') - - # replace quotes with dashes - pre-process - text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text) - - # decode unicode - text = unidecode.unidecode(text) - - # ensure text is still in unicode - if not isinstance(text, _unicode_type): - text = _unicode(text, 'utf-8', 'ignore') - - # character entity reference - if entities: - text = CHAR_ENTITY_PATTERN.sub(lambda m: unichr(name2codepoint[m.group(1)]), text) - - # decimal character reference - if decimal: - try: - text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text) - except Exception: - pass - - # hexadecimal character reference - if hexadecimal: - try: - text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text) - except Exception: - pass - - # translate - text = unicodedata.normalize('NFKD', text) - if sys.version_info < (3,): - text = text.encode('ascii', 'ignore') - - # make the text lowercase (optional) - if lowercase: - text = text.lower() - - # remove generated quotes -- post-process - text = QUOTE_PATTERN.sub('', text) - - # cleanup numbers - text = NUMBERS_PATTERN.sub('', text) - - # replace all other unwanted characters - if lowercase: - pattern = regex_pattern or ALLOWED_CHARS_PATTERN - else: - pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE - text = re.sub(pattern, DEFAULT_SEPARATOR, text) - - # remove redundant - text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR) - - # remove stopwords - if stopwords: - if lowercase: - stopwords_lower = [s.lower() for s in stopwords] - words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower] - else: - words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords] - text = DEFAULT_SEPARATOR.join(words) - - # finalize user-specific replacements - if replacements: - for old, new in replacements: - text = text.replace(old, new) - - # smart truncate if requested - if max_length > 0: - text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order) - - if separator != DEFAULT_SEPARATOR: - text = text.replace(DEFAULT_SEPARATOR, separator) - - return text - - -def main(): # pragma: no cover - if len(sys.argv) < 2: - print("Usage %s TEXT TO SLUGIFY" % sys.argv[0]) - else: - text = ' '.join(sys.argv[1:]) - print(slugify(text)) diff --git a/airflow/api/__init__.py b/airflow/api/__init__.py index 2feb7f4ab3c8b..3750752b49295 100644 --- a/airflow/api/__init__.py +++ b/airflow/api/__init__.py @@ -21,15 +21,16 @@ from __future__ import print_function from importlib import import_module +import logging import warnings import lazy_object_proxy from zope.deprecation import deprecated -from airflow.exceptions import AirflowException, AirflowConfigException from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException, AirflowException -from airflow.utils.log.logging_mixin import LoggingMixin +log = logging.getLogger(__name__) class ApiAuth: # pylint: disable=too-few-public-methods @@ -40,8 +41,6 @@ def __init__(self): API_AUTH = ApiAuth() -LOG = LoggingMixin().log - def load_auth(): """Loads authentication backend""" @@ -67,7 +66,7 @@ def load_auth(): api_auth.client_auth = deprecated('use CLIENT_AUTH', api_auth.CLIENT_AUTH) API_AUTH.api_auth = api_auth except ImportError as 
err: - LOG.critical( + log.critical( "Cannot import %s for API authentication due to: %s", auth_backend, err ) diff --git a/airflow/api/auth/backend/kerberos_auth.py b/airflow/api/auth/backend/kerberos_auth.py index 3e340f86330d0..26f33094ee166 100644 --- a/airflow/api/auth/backend/kerberos_auth.py +++ b/airflow/api/auth/backend/kerberos_auth.py @@ -44,6 +44,7 @@ from future.standard_library import install_aliases +import logging import os from functools import wraps @@ -61,15 +62,14 @@ from requests_kerberos import HTTPKerberosAuth from airflow.configuration import conf -from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) install_aliases() # pylint: disable=c-extension-no-member CLIENT_AUTH = HTTPKerberosAuth(service='airflow') -LOG = LoggingMixin().log - class KerberosService: # pylint: disable=too-few-public-methods """Class to keep information about the Kerberos Service initialized """ @@ -87,7 +87,7 @@ def init_app(app): hostname = app.config.get('SERVER_NAME') if not hostname: hostname = getfqdn() - LOG.info("Kerberos: hostname %s", hostname) + log.info("Kerberos: hostname %s", hostname) service = 'airflow' @@ -97,12 +97,12 @@ def init_app(app): os.environ['KRB5_KTNAME'] = conf.get('kerberos', 'keytab') try: - LOG.info("Kerberos init: %s %s", service, hostname) + log.info("Kerberos init: %s %s", service, hostname) principal = kerberos.getServerPrincipalDetails(service, hostname) except kerberos.KrbError as err: - LOG.warning("Kerberos: %s", err) + log.warning("Kerberos: %s", err) else: - LOG.info("Kerberos API: server is %s", principal) + log.info("Kerberos API: server is %s", principal) def _unauthorized(): diff --git a/airflow/api/client/json_client.py b/airflow/api/client/json_client.py index 9fa17aee07267..b8992f7207b6a 100644 --- a/airflow/api/client/json_client.py +++ b/airflow/api/client/json_client.py @@ -59,7 +59,7 @@ def trigger_dag(self, dag_id, run_id=None, conf=None, execution_date=None): return data['message'] def delete_dag(self, dag_id): - endpoint = '/api/experimental/dags/{}/delete_dag'.format(dag_id) + endpoint = '/api/experimental/dags/{}'.format(dag_id) url = urljoin(self._api_base_url, endpoint) data = self._request(url, method='DELETE') return data['message'] diff --git a/airflow/api/common/experimental/delete_dag.py b/airflow/api/common/experimental/delete_dag.py index db48d35483df7..c6c10ff62849a 100644 --- a/airflow/api/common/experimental/delete_dag.py +++ b/airflow/api/common/experimental/delete_dag.py @@ -17,6 +17,7 @@ # specific language governing permissions and limitations # under the License. 
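Several hunks above replace the old `LoggingMixin().log` lookup with a plain module-level logger. A minimal standalone sketch of that pattern (module and function names here are illustrative, not part of the patch):

```
import logging

log = logging.getLogger(__name__)


def delete_dag_stub(dag_id):
    # Illustrative only: mirrors how the call sites above use the module-level logger.
    log.info("Deleting DAG: %s", dag_id)
    return dag_id
```

Using `logging.getLogger(__name__)` keeps logger names aligned with module paths, so existing logging configuration keyed by module name continues to apply.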
"""Delete DAGs APIs.""" +import logging from sqlalchemy import or_ @@ -26,7 +27,8 @@ from airflow.utils.db import provide_session from airflow.exceptions import DagNotFound from airflow.settings import STORE_SERIALIZED_DAGS -from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) @provide_session @@ -39,8 +41,7 @@ def delete_dag(dag_id, keep_records_in_log=True, session=None): :param session: session used :return count of deleted dags """ - logger = LoggingMixin() - logger.log.info("Deleting DAG: %s", dag_id) + log.info("Deleting DAG: %s", dag_id) dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first() if dag is None: raise DagNotFound("Dag id {} not found".format(dag_id)) diff --git a/airflow/api/common/experimental/mark_tasks.py b/airflow/api/common/experimental/mark_tasks.py index 9cc48521a2fcf..53b52f3ff451e 100644 --- a/airflow/api/common/experimental/mark_tasks.py +++ b/airflow/api/common/experimental/mark_tasks.py @@ -137,8 +137,7 @@ def all_subdag_tasks_query(sub_dag_run_ids, session, state, confirmed_dates): # filter( TaskInstance.dag_id.in_(sub_dag_run_ids), TaskInstance.execution_date.in_(confirmed_dates) # noqa: E123 - ).\ - filter( + ).filter( # noqa: E123 or_( TaskInstance.state.is_(None), TaskInstance.state != state @@ -154,13 +153,12 @@ def get_all_dag_task_query(dag, session, state, task_ids, confirmed_dates): # n TaskInstance.dag_id == dag.dag_id, TaskInstance.execution_date.in_(confirmed_dates), TaskInstance.task_id.in_(task_ids) # noqa: E123 - ).\ - filter( + ).filter( # noqa: E123 or_( TaskInstance.state.is_(None), TaskInstance.state != state ) - ) + ) # noqa: E123 return qry_dag diff --git a/airflow/api/common/experimental/trigger_dag.py b/airflow/api/common/experimental/trigger_dag.py index e7aad064abdcd..7adfac65dab54 100644 --- a/airflow/api/common/experimental/trigger_dag.py +++ b/airflow/api/common/experimental/trigger_dag.py @@ -85,12 +85,10 @@ def _trigger_dag( else: run_conf = json.loads(conf) - triggers = list() - dags_to_trigger = list() - dags_to_trigger.append(dag) - while dags_to_trigger: - dag = dags_to_trigger.pop() - trigger = dag.create_dagrun( + triggers = [] + dags_to_trigger = [dag] + dag.subdags + for _dag in dags_to_trigger: + trigger = _dag.create_dagrun( run_id=run_id, execution_date=execution_date, state=State.RUNNING, @@ -98,8 +96,6 @@ def _trigger_dag( external_trigger=True, ) triggers.append(trigger) - if dag.subdags: - dags_to_trigger.extend(dag.subdags) return triggers diff --git a/airflow/bin/cli.py b/airflow/bin/cli.py index 82162f27c713f..843282ac4409a 100644 --- a/airflow/bin/cli.py +++ b/airflow/bin/cli.py @@ -20,71 +20,90 @@ from __future__ import print_function import errno +import hashlib import importlib +import itertools +import locale import logging import os +import platform import subprocess import textwrap import random import string +import yaml +from collections import OrderedDict, namedtuple from importlib import import_module import getpass + import reprlib import argparse + +import requests +import tenacity from builtins import input from tempfile import NamedTemporaryFile -from airflow.utils.dot_renderer import render_dag -from airflow.utils.timezone import parse as parsedate import json from tabulate import tabulate import daemon from daemon.pidfile import TimeoutPIDLockFile +import io +import psutil +import re import signal import sys import threading -import traceback import time -import psutil -import re -from urllib.parse import urlunparse -from typing import 
Any +import traceback + +from typing import Any, cast import airflow from airflow import api from airflow import jobs, settings -from airflow.configuration import conf +from airflow.configuration import conf, get_airflow_home from airflow.exceptions import AirflowException, AirflowWebServerTimeout from airflow.executors import get_default_executor from airflow.models import ( Connection, DagModel, DagBag, DagPickle, TaskInstance, DagRun, Variable, DAG ) from airflow.ti_deps.dep_context import (DepContext, SCHEDULER_QUEUED_DEPS) +from airflow.typing_compat import Protocol from airflow.utils import cli as cli_utils, db +from airflow.utils.dot_renderer import render_dag from airflow.utils.net import get_hostname +from airflow.utils.timezone import parse as parsedate from airflow.utils.log.logging_mixin import (LoggingMixin, redirect_stderr, redirect_stdout) from airflow.www.app import (cached_app, create_app) from airflow.www_rbac.app import cached_app as cached_app_rbac from airflow.www_rbac.app import create_app as create_app_rbac from airflow.www_rbac.app import cached_appbuilder +from airflow.version import version as airflow_version +import pygments +from pygments.formatters.terminal import TerminalFormatter +from pygments.lexers.configs import IniLexer from sqlalchemy.orm import exc import six +from six.moves.urllib_parse import urlunparse, urlsplit, urlunsplit api.load_auth() api_module = import_module(conf.get('cli', 'api_client')) # type: Any api_client = api_module.Client(api_base_url=conf.get('cli', 'endpoint_url'), auth=api.API_AUTH.api_auth.CLIENT_AUTH) -log = LoggingMixin().log +log = logging.getLogger(__name__) DAGS_FOLDER = settings.DAGS_FOLDER -if "BUILDING_AIRFLOW_DOCS" in os.environ: +BUILD_DOCS = "BUILDING_AIRFLOW_DOCS" in os.environ + +if BUILD_DOCS: DAGS_FOLDER = '[AIRFLOW_HOME]/dags' @@ -222,6 +241,7 @@ def backfill(args, dag=None): ) +@cli_utils.deprecated_action(new_name='dags trigger') @cli_utils.action_logging def trigger_dag(args): """ @@ -240,6 +260,7 @@ def trigger_dag(args): raise AirflowException(err) +@cli_utils.deprecated_action(new_name='dags delete') @cli_utils.action_logging def delete_dag(args): """ @@ -260,6 +281,41 @@ def delete_dag(args): print("Bail.") +def _pool_wrapper(args, get=None, set=None, delete=None, export=None, imp=None): + args.get = get + args.set = set + args.delete = delete + args.export = export + setattr(args, 'import', imp) + pool(args) + + +def pool_list(args): + _pool_wrapper(args) + + +def pool_get(args): + _pool_wrapper(args, get=args.pool) + + +def pool_set(args): + _pool_wrapper(args, set=(args.name, args.slots, args.description)) + + +def pool_delete(args): + _pool_wrapper(args, delete=pool.name) + + +def pool_import(args): + _pool_wrapper(args, imp=args.file) + + +def pool_export(args): + _pool_wrapper(args, export=args.file) + + +@cli_utils.deprecated_action(new_name=['pools list', 'pools get', 'pools set', 'pools delete', 'pools import', + 'pools export']) @cli_utils.action_logging def pool(args): def _tabulate(pools): @@ -329,6 +385,40 @@ def pool_export_helper(filepath): return pools +def _vars_wrapper(args, get=None, set=None, delete=None, export=None, imp=None): + args.get = get + args.set = set + args.delete = delete + args.export = export + setattr(args, 'import', imp) + variables(args) + + +def variables_list(args): + _vars_wrapper(args) + + +def variables_get(args): + _vars_wrapper(args, get=args.key) + + +def variables_delete(args): + _vars_wrapper(args, delete=args.key) + + +def variables_set(args): + 
_vars_wrapper(args, set=args.key) + + +def variables_import(args): + _vars_wrapper(args, imp=args.file) + + +def variables_export(args): + _vars_wrapper(args, export=args.file) + + +@cli_utils.deprecated_action(new_name='variables list') @cli_utils.action_logging def variables(args): if args.get: @@ -401,11 +491,13 @@ def export_helper(filepath): print("{} variables successfully exported to {}".format(len(var_dict), filepath)) +@cli_utils.deprecated_action(new_name='dags pause') @cli_utils.action_logging def pause(args): set_is_paused(True, args) +@cli_utils.deprecated_action(new_name='dags unpause') @cli_utils.action_logging def unpause(args): set_is_paused(False, args) @@ -419,6 +511,7 @@ def set_is_paused(is_paused, args): print("Dag: {}, paused: {}".format(args.dag_id, str(is_paused))) +@cli_utils.deprecated_action(new_name='dags show') def show_dag(args): dag = get_dag(args) dot = render_dag(dag) @@ -497,6 +590,8 @@ def _run(args, dag, ti): executor.end() +# Don't warn on deprecation on this one. It is deprecated, but it is used almost exclusively internally, and +# by not warning we have to make a smaller code change. @cli_utils.action_logging def run(args, dag=None): if dag: @@ -523,7 +618,7 @@ def run(args, dag=None): dag = get_dag(args) elif not dag: with db.create_session() as session: - print('Loading pickle id %s', args.pickle) + print('Loading pickle id ', args.pickle) dag_pickle = session.query(DagPickle).filter(DagPickle.id == args.pickle).first() if not dag_pickle: raise AirflowException("Who hid the pickle!? [missing pickle]") @@ -536,16 +631,44 @@ def run(args, dag=None): ti.init_run_context(raw=args.raw) hostname = get_hostname() - print("Running %s on host %s", ti, hostname) + print("Running {} on host {}".format(ti, hostname)) if args.interactive: _run(args, dag, ti) else: - with redirect_stdout(ti.log, logging.INFO), redirect_stderr(ti.log, logging.WARN): - _run(args, dag, ti) + if settings.DONOT_MODIFY_HANDLERS: + with redirect_stdout(ti.log, logging.INFO), redirect_stderr(ti.log, logging.WARN): + _run(args, dag, ti) + else: + # Get all the Handlers from 'airflow.task' logger + # Add these handlers to the root logger so that we can get logs from + # any custom loggers defined in the DAG + airflow_logger_handlers = logging.getLogger('airflow.task').handlers + root_logger = logging.getLogger() + root_logger_handlers = root_logger.handlers + + # Remove all handlers from Root Logger to avoid duplicate logs + for handler in root_logger_handlers: + root_logger.removeHandler(handler) + + for handler in airflow_logger_handlers: + root_logger.addHandler(handler) + root_logger.setLevel(logging.getLogger('airflow.task').level) + + with redirect_stdout(ti.log, logging.INFO), redirect_stderr(ti.log, logging.WARN): + _run(args, dag, ti) + + # We need to restore the handlers to the loggers as celery worker process + # can call this command multiple times, + # so if we don't reset this then logs from next task would go to the wrong place + for handler in airflow_logger_handlers: + root_logger.removeHandler(handler) + for handler in root_logger_handlers: + root_logger.addHandler(handler) logging.shutdown() +@cli_utils.deprecated_action(new_name='tasks failed-deps') @cli_utils.action_logging def task_failed_deps(args): """ @@ -573,6 +696,7 @@ def task_failed_deps(args): print("Task instance dependencies are all met.") +@cli_utils.deprecated_action(new_name='tasks state') @cli_utils.action_logging def task_state(args): """ @@ -586,6 +710,7 @@ def task_state(args): 
print(ti.current_state()) +@cli_utils.deprecated_action(new_name='dags state') @cli_utils.action_logging def dag_state(args): """ @@ -598,6 +723,7 @@ def dag_state(args): print(dr[0].state if len(dr) > 0 else None) +@cli_utils.deprecated_action(new_name='dags next-execution') @cli_utils.action_logging def next_execution(args): """ @@ -623,6 +749,7 @@ def next_execution(args): print(None) +@cli_utils.deprecated_action(new_name='rotate-fernet-key') @cli_utils.action_logging def rotate_fernet_key(args): session = settings.Session() @@ -634,6 +761,7 @@ def rotate_fernet_key(args): session.commit() +@cli_utils.deprecated_action(new_name=['dags list', 'dags report']) @cli_utils.action_logging def list_dags(args): dagbag = DagBag(process_subdir(args.subdir)) @@ -645,10 +773,17 @@ def list_dags(args): """) dag_list = "\n".join(sorted(dagbag.dags)) print(s.format(dag_list=dag_list)) - if args.report: + if getattr(args, 'report', False): print(dagbag.dagbag_report()) +def list_dags_report(args): + args.report = True + args.deprecation_warning = False + list_dags(args) + + +@cli_utils.deprecated_action(new_name='tasks list') @cli_utils.action_logging def list_tasks(args, dag=None): dag = dag or get_dag(args) @@ -659,6 +794,7 @@ def list_tasks(args, dag=None): print("\n".join(sorted(tasks))) +@cli_utils.deprecated_action(new_name='tasks test') @cli_utils.action_logging def test(args, dag=None): # We want log outout from operators etc to show up here. Normally @@ -695,6 +831,7 @@ def test(args, dag=None): logging.getLogger('airflow.task').propagate = False +@cli_utils.deprecated_action(new_name='tasks render') @cli_utils.action_logging def render(args): dag = get_dag(args) @@ -710,6 +847,7 @@ def render(args): """.format(attr, getattr(task, attr)))) +@cli_utils.deprecated_action(new_name='tasks clear') @cli_utils.action_logging def clear(args): logging.basicConfig( @@ -723,7 +861,8 @@ def clear(args): task_regex=args.task_regex, include_downstream=args.downstream, include_upstream=args.upstream) - + if args.yes: + args.no_confirm = args.yes DAG.clear_dags( dags, start_date=args.start_date, @@ -736,31 +875,11 @@ def clear(args): ) -def get_num_ready_workers_running(gunicorn_master_proc): - workers = psutil.Process(gunicorn_master_proc.pid).children() - - def ready_prefix_on_cmdline(proc): - try: - cmdline = proc.cmdline() - if len(cmdline) > 0: - return settings.GUNICORN_WORKER_READY_PREFIX in cmdline[0] - except psutil.NoSuchProcess: - pass - return False - - ready_workers = [proc for proc in workers if ready_prefix_on_cmdline(proc)] - return len(ready_workers) - - -def get_num_workers_running(gunicorn_master_proc): - workers = psutil.Process(gunicorn_master_proc.pid).children() - return len(workers) - - -def restart_workers(gunicorn_master_proc, num_workers_expected, master_timeout): +class GunicornMonitor(LoggingMixin): """ Runs forever, monitoring the child processes of @gunicorn_master_proc and - restarting workers occasionally. + restarting workers occasionally or when files in the plug-in directory + has been modified. Each iteration of the loop traverses one edge of this state transition diagram, where each state (node) represents [ num_ready_workers_running / num_workers_running ]. We expect most time to @@ -777,92 +896,246 @@ def restart_workers(gunicorn_master_proc, num_workers_expected, master_timeout): master process, which increases and decreases the number of child workers respectively. 
Gunicorn guarantees that on TTOU workers are terminated gracefully and that the oldest worker is terminated. + + :param gunicorn_master_pid: pid of the main Gunicorn process + :param num_workers_expected: Number of workers to run the Gunicorn web server + :param master_timeout: Number of seconds the webserver waits before killing gunicorn master that + doesn't respond + :param worker_refresh_interval: Number of seconds to wait before refreshing a batch of workers. + :param worker_refresh_batch_size: Number of workers to refresh at a time. When set to 0, worker + refresh is disabled. When nonzero, airflow periodically refreshes webserver workers by + bringing up new ones and killing old ones. + :param reload_on_plugin_change: If set to True, Airflow will track files in plugins_follder directory. + When it detects changes, then reload the gunicorn. """ + def __init__( + self, + gunicorn_master_pid, + num_workers_expected, + master_timeout, + worker_refresh_interval, + worker_refresh_batch_size, + reload_on_plugin_change + ): + super(GunicornMonitor, self).__init__() + self.gunicorn_master_proc = psutil.Process(gunicorn_master_pid) + self.num_workers_expected = num_workers_expected + self.master_timeout = master_timeout + self.worker_refresh_interval = worker_refresh_interval + self.worker_refresh_batch_size = worker_refresh_batch_size + self.reload_on_plugin_change = reload_on_plugin_change + + self._num_workers_running = 0 + self._num_ready_workers_running = 0 + self._last_refresh_time = time.time() if worker_refresh_interval > 0 else None + self._last_plugin_state = self._generate_plugin_state() if reload_on_plugin_change else None + self._restart_on_next_plugin_check = False + + def _generate_plugin_state(self): + """ + Generate dict of filenames and last modification time of all files in settings.PLUGINS_FOLDER + directory. 
+ """ + if not settings.PLUGINS_FOLDER: + return {} + + all_filenames = [] + for (root, _, filenames) in os.walk(settings.PLUGINS_FOLDER): + all_filenames.extend(os.path.join(root, f) for f in filenames) + plugin_state = {f: self._get_file_hash(f) for f in sorted(all_filenames)} + return plugin_state + + @staticmethod + def _get_file_hash(fname): + """Calculate MD5 hash for file""" + hash_md5 = hashlib.md5() + with open(fname, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + def _get_num_ready_workers_running(self): + """Returns number of ready Gunicorn workers by looking for READY_PREFIX in process name""" + workers = psutil.Process(self.gunicorn_master_proc.pid).children() + + def ready_prefix_on_cmdline(proc): + try: + cmdline = proc.cmdline() + if len(cmdline) > 0: # pylint: disable=len-as-condition + return settings.GUNICORN_WORKER_READY_PREFIX in cmdline[0] + except psutil.NoSuchProcess: + pass + return False + + ready_workers = [proc for proc in workers if ready_prefix_on_cmdline(proc)] + return len(ready_workers) + + def _get_num_workers_running(self): + """Returns number of running Gunicorn workers processes""" + workers = psutil.Process(self.gunicorn_master_proc.pid).children() + return len(workers) - def wait_until_true(fn, timeout=0): + def _wait_until_true(self, fn, timeout=0): """ Sleeps until fn is true """ - t = time.time() + start_time = time.time() while not fn(): - if 0 < timeout and timeout <= time.time() - t: + if 0 < timeout <= time.time() - start_time: raise AirflowWebServerTimeout( - "No response from gunicorn master within {0} seconds" - .format(timeout)) + "No response from gunicorn master within {0} seconds".format(timeout) + ) time.sleep(0.1) - def start_refresh(gunicorn_master_proc): - batch_size = conf.getint('webserver', 'worker_refresh_batch_size') - log.debug('%s doing a refresh of %s workers', state, batch_size) - sys.stdout.flush() - sys.stderr.flush() - + def _spawn_new_workers(self, count): + """ + Send signal to kill the worker. + :param count: The number of workers to spawn + """ excess = 0 - for _ in range(batch_size): - gunicorn_master_proc.send_signal(signal.SIGTTIN) + for _ in range(count): + # TTIN: Increment the number of processes by one + self.gunicorn_master_proc.send_signal(signal.SIGTTIN) excess += 1 - wait_until_true(lambda: num_workers_expected + excess == - get_num_workers_running(gunicorn_master_proc), - master_timeout) + self._wait_until_true( + lambda: self.num_workers_expected + excess == self._get_num_workers_running(), + timeout=self.master_timeout + ) - try: - wait_until_true(lambda: num_workers_expected == - get_num_workers_running(gunicorn_master_proc), - master_timeout) - while True: - num_workers_running = get_num_workers_running(gunicorn_master_proc) - num_ready_workers_running = \ - get_num_ready_workers_running(gunicorn_master_proc) - - state = '[{0} / {1}]'.format(num_ready_workers_running, num_workers_running) - - # Whenever some workers are not ready, wait until all workers are ready - if num_ready_workers_running < num_workers_running: - log.debug('%s some workers are starting up, waiting...', state) - sys.stdout.flush() + def _kill_old_workers(self, count): + """ + Send signal to kill the worker. 
+ :param count: The number of workers to kill + """ + for _ in range(count): + count -= 1 + # TTOU: Decrement the number of processes by one + self.gunicorn_master_proc.send_signal(signal.SIGTTOU) + self._wait_until_true( + lambda: self.num_workers_expected + count == self._get_num_workers_running(), + timeout=self.master_timeout) + + def _reload_gunicorn(self): + """ + Send signal to reload the gunciron configuration. When gunciorn receive signals, it reload the + configuration, start the new worker processes with a new configuration and gracefully + shutdown older workers. + """ + # HUP: Reload the configuration. + self.gunicorn_master_proc.send_signal(signal.SIGHUP) + time.sleep(1) + self._wait_until_true( + lambda: self.num_workers_expected == self._get_num_workers_running(), + timeout=self.master_timeout + ) + + def start(self): + """ + Starts monitoring the webserver. + """ + self.log.debug("Start monitoring gunicorn") + try: # pylint: disable=too-many-nested-blocks + self._wait_until_true( + lambda: self.num_workers_expected == self._get_num_workers_running(), + timeout=self.master_timeout + ) + while True: + if not self.gunicorn_master_proc.is_running(): + sys.exit(1) + self._check_workers() + # Throttle loop time.sleep(1) - # Kill a worker gracefully by asking gunicorn to reduce number of workers - elif num_workers_running > num_workers_expected: - excess = num_workers_running - num_workers_expected - log.debug('%s killing %s workers', state, excess) - - for _ in range(excess): - gunicorn_master_proc.send_signal(signal.SIGTTOU) - excess -= 1 - wait_until_true(lambda: num_workers_expected + excess == - get_num_workers_running(gunicorn_master_proc), - master_timeout) - - # Start a new worker by asking gunicorn to increase number of workers - elif num_workers_running == num_workers_expected: - refresh_interval = conf.getint('webserver', 'worker_refresh_interval') - log.debug( - '%s sleeping for %ss starting doing a refresh...', - state, refresh_interval + except (AirflowWebServerTimeout, OSError) as err: + self.log.error(err) + self.log.error("Shutting down webserver") + try: + self.gunicorn_master_proc.terminate() + self.gunicorn_master_proc.wait() + finally: + sys.exit(1) + + def _check_workers(self): + num_workers_running = self._get_num_workers_running() + num_ready_workers_running = self._get_num_ready_workers_running() + + # Whenever some workers are not ready, wait until all workers are ready + if num_ready_workers_running < num_workers_running: + self.log.debug( + '[%d / %d] Some workers are starting up, waiting...', + num_ready_workers_running, num_workers_running + ) + time.sleep(1) + return + + # If there are too many workers, then kill a worker gracefully by asking gunicorn to reduce + # number of workers + if num_workers_running > self.num_workers_expected: + excess = min(num_workers_running - self.num_workers_expected, self.worker_refresh_batch_size) + self.log.debug( + '[%d / %d] Killing %s workers', num_ready_workers_running, num_workers_running, excess + ) + self._kill_old_workers(excess) + return + + # If there are too few workers, start a new worker by asking gunicorn + # to increase number of workers + if num_workers_running < self.num_workers_expected: + self.log.error( + "[%d / %d] Some workers seem to have died and gunicorn did not restart " + "them as expected", + num_ready_workers_running, num_workers_running + ) + time.sleep(10) + num_workers_running = self._get_num_workers_running() + if num_workers_running < self.num_workers_expected: + new_worker_count = 
min( + self.num_workers_expected - num_workers_running, self.worker_refresh_batch_size ) - time.sleep(refresh_interval) - start_refresh(gunicorn_master_proc) + self.log.info( + '[%d / %d] Spawning %d workers', + num_ready_workers_running, num_workers_running, new_worker_count + ) + self._spawn_new_workers(new_worker_count) + return - else: - # num_ready_workers_running == num_workers_running < num_workers_expected - log.error(( - "%s some workers seem to have died and gunicorn" - "did not restart them as expected" - ), state) - time.sleep(10) - if len( - psutil.Process(gunicorn_master_proc.pid).children() - ) < num_workers_expected: - start_refresh(gunicorn_master_proc) - except (AirflowWebServerTimeout, OSError) as err: - log.error(err) - log.error("Shutting down webserver") - try: - gunicorn_master_proc.terminate() - gunicorn_master_proc.wait() - finally: - sys.exit(1) + # Now the number of running and expected worker should be equal + + # If workers should be restarted periodically. + if self.worker_refresh_interval > 0 and self._last_refresh_time: + # and we refreshed the workers a long time ago, refresh the workers + last_refresh_diff = (time.time() - self._last_refresh_time) + if self.worker_refresh_interval < last_refresh_diff: + num_new_workers = self.worker_refresh_batch_size + self.log.debug( + '[%d / %d] Starting doing a refresh. Starting %d workers.', + num_ready_workers_running, num_workers_running, num_new_workers + ) + self._spawn_new_workers(num_new_workers) + self._last_refresh_time = time.time() + return + + # if we should check the directory with the plugin, + if self.reload_on_plugin_change: + # compare the previous and current contents of the directory + new_state = self._generate_plugin_state() + # If changed, wait until its content is fully saved. + if new_state != self._last_plugin_state: + self.log.debug( + '[%d / %d] Plugins folder changed. The gunicorn will be restarted the next time the ' + 'plugin directory is checked, if there is no change in it.', + num_ready_workers_running, num_workers_running + ) + self._restart_on_next_plugin_check = True + self._last_plugin_state = new_state + elif self._restart_on_next_plugin_check: + self.log.debug( + '[%d / %d] Starts reloading the gunicorn configuration.', + num_ready_workers_running, num_workers_running + ) + self._restart_on_next_plugin_check = False + self._last_refresh_time = time.time() + self._reload_gunicorn() @cli_utils.action_logging @@ -870,6 +1143,17 @@ def webserver(args): py2_deprecation_waring() print(settings.HEADER) + # Check for old/insecure config, and fail safe (i.e. don't launch) if the config is wildly insecure. + if conf.get('webserver', 'secret_key') == 'temporary_key': + print( + "ERROR: The `secret_key` setting under the webserver config has an insecure " + "value - Airflow has failed safe and refuses to start. 
Please change this value to a new, " + "per-environment, randomly generated string, for example using this command `openssl rand " + "-hex 30`", + file=sys.stderr, + ) + sys.exit(1) + access_logfile = args.access_logfile or conf.get('webserver', 'access_logfile') error_logfile = args.error_logfile or conf.get('webserver', 'error_logfile') num_workers = args.workers or conf.get('webserver', 'workers') @@ -921,13 +1205,13 @@ def webserver(args): run_args = [ 'gunicorn', - '-w', str(num_workers), - '-k', str(args.workerclass), - '-t', str(worker_timeout), - '-b', args.hostname + ':' + str(args.port), - '-n', 'airflow-webserver', - '-p', str(pid), - '-c', 'python:airflow.www.gunicorn_config', + '--workers', str(num_workers), + '--worker-class', str(args.workerclass), + '--timeout', str(worker_timeout), + '--bind', args.hostname + ':' + str(args.port), + '--name', 'airflow-webserver', + '--pid', str(pid), + '--config', 'python:airflow.www.gunicorn_config', ] if args.access_logfile: @@ -937,7 +1221,7 @@ def webserver(args): run_args += ['--error-logfile', str(args.error_logfile)] if args.daemon: - run_args += ['-D'] + run_args += ['--daemon'] if ssl_cert: run_args += ['--certfile', ssl_cert, '--keyfile', ssl_key] @@ -947,19 +1231,24 @@ def webserver(args): gunicorn_master_proc = None - def kill_proc(dummy_signum, dummy_frame): + def kill_proc(signum, _): + log.info("Received signal: %s. Closing gunicorn.", signum) gunicorn_master_proc.terminate() gunicorn_master_proc.wait() sys.exit(0) - def monitor_gunicorn(gunicorn_master_proc): + def monitor_gunicorn(gunicorn_master_pid): # These run forever until SIG{INT, TERM, KILL, ...} signal is sent - if conf.getint('webserver', 'worker_refresh_interval') > 0: - master_timeout = conf.getint('webserver', 'web_server_master_timeout') - restart_workers(gunicorn_master_proc, num_workers, master_timeout) - else: - while True: - time.sleep(1) + GunicornMonitor( + gunicorn_master_pid=gunicorn_master_pid, + num_workers_expected=num_workers, + master_timeout=conf.getint('webserver', 'web_server_master_timeout'), + worker_refresh_interval=conf.getint('webserver', 'worker_refresh_interval', fallback=30), + worker_refresh_batch_size=conf.getint('webserver', 'worker_refresh_batch_size', fallback=1), + reload_on_plugin_change=conf.getboolean( + 'webserver', 'reload_on_plugin_change', fallback=False + ), + ).start() if args.daemon: base, ext = os.path.splitext(pid) @@ -988,7 +1277,7 @@ def monitor_gunicorn(gunicorn_master_proc): time.sleep(0.1) gunicorn_master_proc = psutil.Process(gunicorn_master_proc_pid) - monitor_gunicorn(gunicorn_master_proc) + monitor_gunicorn(gunicorn_master_proc.pid) stdout.close() stderr.close() @@ -998,7 +1287,7 @@ def monitor_gunicorn(gunicorn_master_proc): signal.signal(signal.SIGINT, kill_proc) signal.signal(signal.SIGTERM, kill_proc) - monitor_gunicorn(gunicorn_master_proc) + monitor_gunicorn(gunicorn_master_proc.pid) @cli_utils.action_logging @@ -1067,6 +1356,200 @@ def _serve_logs(env, skip_serve_logs=False): return None +@cli_utils.deprecated_action(new_name='kubernetes generate-dag-yaml') +@cli_utils.action_logging +def kubernetes_generate_dag_yaml(args): + from airflow.executors.kubernetes_executor import AirflowKubernetesScheduler, KubeConfig + from airflow.kubernetes.pod_generator import PodGenerator + from airflow.kubernetes.pod_launcher import PodLauncher + from airflow.kubernetes.worker_configuration import WorkerConfiguration + from kubernetes.client.api_client import ApiClient + dag = get_dag(args) + yaml_output_path = 
args.output_path + kube_config = KubeConfig() + for task in dag.tasks: + ti = TaskInstance(task, args.execution_date) + pod = PodGenerator.construct_pod( + dag_id=args.dag_id, + task_id=ti.task_id, + pod_id=AirflowKubernetesScheduler._create_pod_id( # pylint: disable=W0212 + args.dag_id, ti.task_id), + try_number=ti.try_number, + date=ti.execution_date, + command=ti.command_as_list(), + kube_image=kube_config.kube_image, + pod_override_object=PodGenerator.from_obj(ti.executor_config), + worker_uuid="worker-config", + namespace=kube_config.executor_namespace, + base_worker_pod=WorkerConfiguration(kube_config=kube_config).as_pod() + ) + api_client = ApiClient() + pod = PodLauncher._mutate_pod_backcompat(pod) + date_string = AirflowKubernetesScheduler._datetime_to_label_safe_datestring( # pylint: disable=W0212 + args.execution_date) + yaml_file_name = "{}_{}_{}.yml".format(args.dag_id, ti.task_id, date_string) + os.makedirs(os.path.dirname(yaml_output_path + "/airflow_yaml_output/"), exist_ok=True) + with open(yaml_output_path + "/airflow_yaml_output/" + yaml_file_name, "w") as output: + sanitized_pod = api_client.sanitize_for_serialization(pod) + output.write(yaml.dump(sanitized_pod)) + print("YAML output can be found at {}/airflow_yaml_output/".format(yaml_output_path)) + + +@cli_utils.action_logging +def generate_pod_template(args): + from airflow.executors.kubernetes_executor import KubeConfig + from airflow.kubernetes.worker_configuration import WorkerConfiguration + from kubernetes.client.api_client import ApiClient + kube_config = KubeConfig() + worker_configuration_pod = WorkerConfiguration(kube_config=kube_config).as_pod() + api_client = ApiClient() + yaml_file_name = "airflow_template.yml" + yaml_output_path = args.output_path + if not os.path.exists(yaml_output_path): + os.makedirs(yaml_output_path) + with open(yaml_output_path + "/" + yaml_file_name, "w") as output: + sanitized_pod = api_client.sanitize_for_serialization(worker_configuration_pod) + sanitized_pod = json.dumps(sanitized_pod) + sanitized_pod = json.loads(sanitized_pod) + output.write(yaml.safe_dump(sanitized_pod)) + output_string = """ +Congratulations on migrating your kubernetes configs to the pod_template_file! + +This is a critical first step on your migration to Airflow 2.0. + +Please check the following file and ensure that all configurations are correct: {yaml_file_name} + + +Please place this file in a desired location on your machine and set the following airflow.cfg config: + +``` +[kubernetes] + pod_template_file=/path/to/{yaml_file_name} +``` + +You will now have full access to the Kubernetes API. 
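For reference, a minimal sketch of the serialization round-trip used when writing the template file earlier in this function; the pod object and its name here are made-up examples. `sanitize_for_serialization()` reduces the client model to plain dicts and lists, and the json dump/load pass (mirroring the code above) leaves only plain types for `yaml.safe_dump`:

```
import json

import yaml
from kubernetes.client import ApiClient, V1ObjectMeta, V1Pod

pod = V1Pod(metadata=V1ObjectMeta(name="example-worker"))  # assumed example object
sanitized = ApiClient().sanitize_for_serialization(pod)
sanitized = json.loads(json.dumps(sanitized))
print(yaml.safe_dump(sanitized))
```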
+ +Please note that the following configs will no longer be considered when the pod_template_file is set: + +worker_container_image_pull_policy +airflow_configmap +airflow_local_settings_configmap +dags_in_image +dags_volume_subpath +dags_volume_mount_point +dags_volume_claim +logs_volume_subpath +logs_volume_claim +dags_volume_host +logs_volume_host +env_from_configmap_ref +env_from_secret_ref +git_repo +git_branch +git_sync_depth +git_subpath +git_sync_rev +git_user +git_password +git_sync_root +git_sync_dest +git_dags_folder_mount_point +git_ssh_key_secret_name +git_ssh_known_hosts_configmap_name +git_sync_credentials_secret +git_sync_container_repository +git_sync_container_tag +git_sync_init_container_name +git_sync_run_as_user +worker_service_account_name +image_pull_secrets +gcp_service_account_keys +affinity +tolerations +run_as_user +fs_group +[kubernetes_node_selectors] +[kubernetes_annotations] +[kubernetes_environment_variables] +[kubernetes_secrets] +[kubernetes_labels] + + +Happy Airflowing! + +""".format(yaml_file_name=yaml_file_name) + print(output_string) + + +@cli_utils.action_logging +def cleanup_pods(args): + from kubernetes.client.rest import ApiException + + from airflow.kubernetes.kube_client import get_kube_client + + """Clean up k8s pods in evicted/failed/succeeded states""" + namespace = args.namespace + + # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/ + # All Containers in the Pod have terminated in success, and will not be restarted. + pod_succeeded = 'succeeded' + + # All Containers in the Pod have terminated, and at least one Container has terminated in failure. + # That is, the Container either exited with non-zero status or was terminated by the system. + pod_failed = 'failed' + + # https://kubernetes.io/docs/tasks/administer-cluster/out-of-resource/ + pod_reason_evicted = 'evicted' + # If pod is failed and restartPolicy is: + # * Always: Restart Container; Pod phase stays Running. + # * OnFailure: Restart Container; Pod phase stays Running. + # * Never: Pod phase becomes Failed. 
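Condensing the comments above into a standalone predicate (names assumed) that matches the check applied in the listing loop below: succeeded pods, failed pods whose restart policy is `Never`, and evicted pods are deleted.

```
def should_delete_pod(phase, reason, restart_policy):
    """Return True when cleanup_pods would remove the pod (inputs lowercased)."""
    return (
        phase == 'succeeded'
        or (phase == 'failed' and restart_policy == 'never')
        or reason == 'evicted'
    )


# should_delete_pod('failed', '', 'never')   -> True
# should_delete_pod('running', '', 'always') -> False
```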
+ pod_restart_policy_never = 'never' + + print('Loading Kubernetes configuration') + kube_client = get_kube_client() + print('Listing pods in namespace {}'.format(namespace)) + continue_token = None + while True: # pylint: disable=too-many-nested-blocks + pod_list = kube_client.list_namespaced_pod(namespace=namespace, limit=500, _continue=continue_token) + for pod in pod_list.items: + pod_name = pod.metadata.name + print('Inspecting pod {}'.format(pod_name)) + pod_phase = pod.status.phase.lower() + pod_reason = pod.status.reason.lower() if pod.status.reason else '' + pod_restart_policy = pod.spec.restart_policy.lower() + + if ( + pod_phase == pod_succeeded + or (pod_phase == pod_failed and pod_restart_policy == pod_restart_policy_never) + or (pod_reason == pod_reason_evicted) + ): + print('Deleting pod "{}" phase "{}" and reason "{}", restart policy "{}"'.format( + pod_name, pod_phase, pod_reason, pod_restart_policy) + ) + try: + _delete_pod(pod.metadata.name, namespace) + except ApiException as e: + print("can't remove POD: {}".format(e), file=sys.stderr) + continue + print('No action taken on pod {}'.format(pod_name)) + continue_token = pod_list.metadata._continue # pylint: disable=protected-access + if not continue_token: + break + + +def _delete_pod(name, namespace): + """Helper Function for cleanup_pods""" + from kubernetes import client + + core_v1 = client.CoreV1Api() + delete_options = client.V1DeleteOptions() + print('Deleting POD "{}" from "{}" namespace'.format(name, namespace)) + api_response = core_v1.delete_namespaced_pod(name=name, namespace=namespace, body=delete_options) + print(api_response) + + +@cli_utils.deprecated_action(new_name='celery worker') @cli_utils.action_logging def worker(args): env = os.environ.copy() @@ -1078,6 +1561,7 @@ def worker(args): # Celery worker from airflow.executors.celery_executor import app as celery_app + from celery import maybe_patch_concurrency from celery.bin import worker autoscale = args.autoscale @@ -1098,7 +1582,14 @@ def worker(args): } if conf.has_option("celery", "pool"): - options["pool"] = conf.get("celery", "pool") + pool = conf.get("celery", "pool") + options["pool"] = pool + # Celery pools of type eventlet and gevent use greenlets, which + # requires monkey patching the app: + # https://eventlet.net/doc/patching.html#monkey-patch + # Otherwise task instances hang on the workers and are never + # executed. 
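The call that follows is shown here in isolation (with an assumed pool value) for context: `maybe_patch_concurrency` parses the `['-P', pool]` argument, applies eventlet/gevent monkey patching when one of those pools is configured, and is a no-op for the default prefork pool.

```
from celery import maybe_patch_concurrency

configured_pool = "eventlet"  # assumed value of [celery] pool in airflow.cfg
maybe_patch_concurrency(['-P', configured_pool])
```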
+ maybe_patch_concurrency(['-P', pool]) if args.daemon: pid, stdout, stderr, log_file = setup_locations("worker", @@ -1134,6 +1625,7 @@ def worker(args): sp.kill() +@cli_utils.deprecated_action(new_name='db init') def initdb(args): # noqa py2_deprecation_waring() print("DB: " + repr(settings.engine.url)) @@ -1141,6 +1633,7 @@ def initdb(args): # noqa print("Done.") +@cli_utils.deprecated_action(new_name='db reset') def resetdb(args): py2_deprecation_waring() print("DB: " + repr(settings.engine.url)) @@ -1152,6 +1645,7 @@ def resetdb(args): print("Bail.") +@cli_utils.deprecated_action(new_name='db shell') @cli_utils.action_logging def shell(args): """Run a shell that allows to access database access""" @@ -1193,6 +1687,7 @@ def upgradedb(args): # noqa db.upgradedb() +@cli_utils.deprecated_action(new_name='db check') @cli_utils.action_logging def checkdb(args): # noqa py2_deprecation_waring() @@ -1209,6 +1704,26 @@ def version(args): # noqa 'conn_login', 'conn_password', 'conn_schema', 'conn_port'] +def _conn_wrapper(args, list=None, delete=None, add=None): + args.list = list + args.delete = delete + args.add = add + connections(args) + + +def connections_list(args): + _conn_wrapper(args, list=True) + + +def connections_add(args): + _conn_wrapper(args, add=True) + + +def connections_delete(args): + _conn_wrapper(args, delete=True) + + +@cli_utils.deprecated_action(sub_commands=True) @cli_utils.action_logging def connections(args): if args.list: @@ -1342,6 +1857,7 @@ def connections(args): return +@cli_utils.deprecated_action(new_name='celery flower') @cli_utils.action_logging def flower(args): broka = conf.get('celery', 'BROKER_URL') @@ -1421,6 +1937,7 @@ def kerberos(args): # noqa airflow.security.kerberos.run(principal=args.principal, keytab=args.keytab) +@cli_utils.deprecated_action(new_name='users create') @cli_utils.action_logging def create_user(args): fields = { @@ -1461,6 +1978,7 @@ def create_user(args): raise SystemExit('Failed to create user.') +@cli_utils.deprecated_action(new_name='users delete') @cli_utils.action_logging def delete_user(args): if not args.username: @@ -1479,6 +1997,7 @@ def delete_user(args): raise SystemExit('Failed to delete user.') +@cli_utils.deprecated_action(new_name='users list') @cli_utils.action_logging def list_users(args): appbuilder = cached_appbuilder() @@ -1492,6 +2011,7 @@ def list_users(args): print(msg) +@cli_utils.deprecated_action(new_name='dags list-runs') @cli_utils.action_logging def list_dag_runs(args, dag=None): if dag: @@ -1544,14 +2064,16 @@ def list_dag_runs(args, dag=None): print(record) +@cli_utils.deprecated_action(new_name='sync-perm') @cli_utils.action_logging def sync_perm(args): # noqa if settings.RBAC: appbuilder = cached_appbuilder() print('Updating permission, view-menu for all existing roles') appbuilder.sm.sync_roles() + appbuilder.add_permissions(update_perms=True) print('Updating permission on all DAG views') - dags = DagBag().dags.values() + dags = DagBag(store_serialized_dags=settings.STORE_SERIALIZED_DAGS).dags.values() for dag in dags: appbuilder.sm.sync_perm_for_dag( dag.dag_id, @@ -1560,17 +2082,1367 @@ def sync_perm(args): # noqa print('The sync_perm command only works for rbac UI.') -class Arg(object): - def __init__(self, flags=None, help=None, action=None, default=None, nargs=None, - type=None, choices=None, metavar=None): +@cli_utils.deprecated_action(new_name='config list') +def config(args): + """Show current application configuration""" + with io.StringIO() as output: + conf.write(output) + code = 
output.getvalue() + if cli_utils.should_use_colors(args): + code = pygments.highlight( + code=code, formatter=TerminalFormatter(), lexer=IniLexer() + ) + print(code) + + +class Anonymizer(Protocol): + """Anonymizer protocol.""" + + def process_path(self, value): + """Remove pii from paths""" + + def process_username(self, value): + """Remove pii from ussername""" + + def process_url(self, value): + """Remove pii from URL""" + + +class NullAnonymizer(Anonymizer): + """Do nothing.""" + + def _identity(self, value): + return value + + process_path = process_username = process_url = _identity + + del _identity + + +class PiiAnonymizer(Anonymizer): + """Remove personally identifiable info from path.""" + + def __init__(self): + home_path = os.path.expanduser("~") + username = getpass.getuser() + self._path_replacements = OrderedDict([ + (home_path, "${HOME}"), (username, "${USER}") + ]) + + def process_path(self, value): + if not value: + return value + for src, target in self._path_replacements.items(): + value = value.replace(src, target) + return value + + def process_username(self, value): + if not value: + return value + return value[0] + "..." + value[-1] + + def process_url(self, value): + if not value: + return value + + url_parts = urlsplit(value) + netloc = None + if url_parts.netloc: + # unpack + userinfo = None + host = None + username = None + password = None + + if "@" in url_parts.netloc: + userinfo, _, host = url_parts.netloc.partition("@") + else: + host = url_parts.netloc + if userinfo: + if ":" in userinfo: + username, _, password = userinfo.partition(":") + else: + username = userinfo + + # anonymize + username = self.process_username(username) if username else None + password = "PASSWORD" if password else None + + # pack + if username and password and host: + netloc = username + ":" + password + "@" + host + elif username and host: + netloc = username + "@" + host + elif password and host: + netloc = ":" + password + "@" + host + elif host: + netloc = host + else: + netloc = "" + + return urlunsplit((url_parts.scheme, netloc, url_parts.path, url_parts.query, url_parts.fragment)) + + +class OperatingSystem: + """Operating system""" + + WINDOWS = "Windows" + LINUX = "Linux" + MACOSX = "Mac OS" + CYGWIN = "Cygwin" + + @staticmethod + def get_current(): + """Get current operating system""" + if os.name == "nt": + return OperatingSystem.WINDOWS + elif "linux" in sys.platform: + return OperatingSystem.LINUX + elif "darwin" in sys.platform: + return OperatingSystem.MACOSX + elif "cygwin" in sys.platform: + return OperatingSystem.CYGWIN + return None + + +class Architecture: + """Compute architecture""" + + X86_64 = "x86_64" + X86 = "x86" + PPC = "ppc" + ARM = "arm" + + @staticmethod + def get_current(): + """Get architecture""" + return _MACHINE_TO_ARCHITECTURE.get(platform.machine().lower()) + + +_MACHINE_TO_ARCHITECTURE = { + "amd64": Architecture.X86_64, + "x86_64": Architecture.X86_64, + "i686-64": Architecture.X86_64, + "i386": Architecture.X86, + "i686": Architecture.X86, + "x86": Architecture.X86, + "ia64": Architecture.X86, # Itanium is different x64 arch, treat it as the common x86. 
+ "powerpc": Architecture.PPC, + "power macintosh": Architecture.PPC, + "ppc64": Architecture.PPC, + "armv6": Architecture.ARM, + "armv6l": Architecture.ARM, + "arm64": Architecture.ARM, + "armv7": Architecture.ARM, + "armv7l": Architecture.ARM, +} + + +class AirflowInfo: + """All information related to Airflow, system and other.""" + + def __init__(self, anonymizer): + self.airflow_version = airflow_version + self.system = SystemInfo(anonymizer) + self.tools = ToolsInfo(anonymizer) + self.paths = PathsInfo(anonymizer) + self.config = ConfigInfo(anonymizer) + + def __str__(self): + return ( + textwrap.dedent( + """\ + Apache Airflow [{version}] + + {system} + + {tools} + + {paths} + + {config} + """ + ) + .strip() + .format( + version=self.airflow_version, + system=self.system, + tools=self.tools, + paths=self.paths, + config=self.config, + ) + ) + + +class SystemInfo: + """Basic system and python information""" + + def __init__(self, anonymizer): + self.operating_system = OperatingSystem.get_current() + self.arch = Architecture.get_current() + self.uname = platform.uname() + self.locale = locale.getdefaultlocale() + self.python_location = anonymizer.process_path(sys.executable) + self.python_version = sys.version.replace("\n", " ") + + def __str__(self): + return ( + textwrap.dedent( + """\ + Platform: [{os}, {arch}] {uname} + Locale: {locale} + Python Version: [{python_version}] + Python Location: [{python_location}] + """ + ) + .strip() + .format( + os=self.operating_system or "NOT AVAILABLE", + arch=self.arch or "NOT AVAILABLE", + uname=self.uname, + locale=self.locale, + python_version=self.python_version, + python_location=self.python_location, + ) + ) + + +class PathsInfo: + """Path information""" + + def __init__(self, anonymizer): + system_path = os.environ.get("PATH", "").split(os.pathsep) + + self.airflow_home = anonymizer.process_path(get_airflow_home()) + self.system_path = [anonymizer.process_path(p) for p in system_path] + self.python_path = [anonymizer.process_path(p) for p in sys.path] + self.airflow_on_path = any( + os.path.exists(os.path.join(path_elem, "airflow")) for path_elem in system_path + ) + + def __str__(self): + return ( + textwrap.dedent( + """\ + Airflow Home: [{airflow_home}] + System PATH: [{system_path}] + Python PATH: [{python_path}] + airflow on PATH: [{airflow_on_path}] + """ + ) + .strip() + .format( + airflow_home=self.airflow_home, + system_path=os.pathsep.join(self.system_path), + python_path=os.pathsep.join(self.python_path), + airflow_on_path=self.airflow_on_path, + ) + ) + + +class ConfigInfo: + """"Most critical config properties""" + + def __init__(self, anonymizer): + self.executor = conf.get("core", "executor") + self.dags_folder = anonymizer.process_path( + conf.get("core", "dags_folder", fallback="NOT AVAILABLE") + ) + self.plugins_folder = anonymizer.process_path( + conf.get("core", "plugins_folder", fallback="NOT AVAILABLE") + ) + self.base_log_folder = anonymizer.process_path( + conf.get("core", "base_log_folder", fallback="NOT AVAILABLE") + ) + self.sql_alchemy_conn = anonymizer.process_url( + conf.get("core", "SQL_ALCHEMY_CONN", fallback="NOT AVAILABLE") + ) + + def __str__(self): + return ( + textwrap.dedent( + """\ + Executor: [{executor}] + SQL Alchemy Conn: [{sql_alchemy_conn}] + DAGS Folder: [{dags_folder}] + Plugins Folder: [{plugins_folder}] + Base Log Folder: [{base_log_folder}] + """ + ) + .strip() + .format( + executor=self.executor, + sql_alchemy_conn=self.sql_alchemy_conn, + dags_folder=self.dags_folder, + 
+                plugins_folder=self.plugins_folder,
+                base_log_folder=self.base_log_folder,
+            )
+        )
+
+
+class ToolsInfo:
+    """The versions of the tools that Airflow uses"""
+
+    def __init__(self, anonymize):
+        del anonymize  # Nothing to anonymize here.
+        self.git_version = self._get_version(["git", "--version"])
+        self.ssh_version = self._get_version(["ssh", "-V"])
+        self.kubectl_version = self._get_version(["kubectl", "version", "--short=True", "--client=True"])
+        self.gcloud_version = self._get_version(["gcloud", "version"], grep=b"Google Cloud SDK")
+        self.cloud_sql_proxy_version = self._get_version(["cloud_sql_proxy", "--version"])
+        self.mysql_version = self._get_version(["mysql", "--version"])
+        self.sqlite3_version = self._get_version(["sqlite3", "--version"])
+        self.psql_version = self._get_version(["psql", "--version"])
+
+    def _get_version(self, cmd, grep=None):
+        """Return tools version."""
+        try:
+            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        except OSError:
+            return "NOT AVAILABLE"
+        stdoutdata, _ = proc.communicate()
+        data = [f for f in stdoutdata.split(b"\n") if f]
+        if grep:
+            data = [line for line in data if grep in line]
+        if len(data) != 1:
+            return "NOT AVAILABLE"
+        else:
+            return data[0].decode()
+
+    def __str__(self):
+        return (
+            textwrap.dedent(
+                """\
+                git: [{git}]
+                ssh: [{ssh}]
+                kubectl: [{kubectl}]
+                gcloud: [{gcloud}]
+                cloud_sql_proxy: [{cloud_sql_proxy}]
+                mysql: [{mysql}]
+                sqlite3: [{sqlite3}]
+                psql: [{psql}]
+                """
+            )
+            .strip()
+            .format(
+                git=self.git_version,
+                ssh=self.ssh_version,
+                kubectl=self.kubectl_version,
+                gcloud=self.gcloud_version,
+                cloud_sql_proxy=self.cloud_sql_proxy_version,
+                mysql=self.mysql_version,
+                sqlite3=self.sqlite3_version,
+                psql=self.psql_version,
+            )
+        )
+
+
+class FileIoException(Exception):
+    """Raised when an error happens in the file.io integration"""
+
+
+@tenacity.retry(
+    stop=tenacity.stop_after_attempt(5),
+    wait=tenacity.wait_exponential(multiplier=1, max=10),
+    retry=tenacity.retry_if_exception_type(FileIoException),
+    before=tenacity.before_log(log, logging.DEBUG),
+    after=tenacity.after_log(log, logging.DEBUG),
+)
+def _upload_text_to_fileio(content):
+    """Upload a text file to the file.io service and return the link"""
+    resp = requests.post("https://file.io", files={"file": ("airflow-report.txt", content)})
+    if not resp.ok:
+        raise FileIoException("Failed to send report to file.io service.")
+    try:
+        return resp.json()["link"]
+    except ValueError as e:
+        log.debug(e)
+        raise FileIoException("Failed to send report to file.io service.")
+
+
+def _send_report_to_fileio(info):
+    print("Uploading report to file.io service.")
+    try:
+        link = _upload_text_to_fileio(str(info))
+        print("Report uploaded.")
+        print()
+        print("Link:\t", link)
+        print()
+    except FileIoException as ex:
+        print(str(ex))
+
+
+def info(args):
+    """
+    Show information related to Airflow, system and other.
+    """
+    # Enforce anonymization when the file_io upload is turned on.
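+    # Editor's illustrative aside (not part of the original patch), showing what the
+    # PiiAnonymizer defined above does to a made-up connection URL and path:
+    #
+    #   >>> anon = PiiAnonymizer()
+    #   >>> anon.process_url("postgresql://airflow_user:s3cret@db.example.com:5432/airflow")
+    #   'postgresql://a...r:PASSWORD@db.example.com:5432/airflow'
+    #   >>> anon.process_path("/home/airflow_user/dags")  # assuming HOME=/home/airflow_user
+    #   '${HOME}/dags'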
+ anonymizer = PiiAnonymizer() if args.anonymize or args.file_io else NullAnonymizer() + info = AirflowInfo(anonymizer) + if args.file_io: + _send_report_to_fileio(info) + else: + print(info) + + +def upgrade_check(args): + sys.exit(""" +Please install apache-airflow-upgrade-check distribution from PyPI to perform upgrade checks +""") + + +# Used in Arg to enable `None' as a distinct value from "not passed" +_UNSET = object() + + +class Arg: + """Class to keep information about command line argument""" + + # pylint: disable=redefined-builtin,unused-argument + def __init__( + self, + flags=_UNSET, + help=_UNSET, + action=_UNSET, + default=_UNSET, + nargs=_UNSET, + type=_UNSET, + choices=_UNSET, + required=_UNSET, + metavar=_UNSET, + ): self.flags = flags - self.help = help - self.action = action - self.default = default - self.nargs = nargs - self.type = type - self.choices = choices - self.metavar = metavar + self.kwargs = {} + for k, v in locals().items(): + if v is _UNSET: + continue + if k in ("self", "flags"): + continue + + self.kwargs[k] = v + + # pylint: enable=redefined-builtin,unused-argument + + def add_to_parser(self, parser): + """Add this argument to an ArgumentParser""" + parser.add_argument(*self.flags, **self.kwargs) + + +def positive_int(value): + """Define a positive int type for an argument.""" + try: + value = int(value) + if value > 0: + return value + except ValueError: + pass + raise argparse.ArgumentTypeError("invalid positive int value: '{}'".format(value)) + + +# Shared +ARG_DAG_ID = Arg(("dag_id",), help="The id of the dag") +ARG_TASK_ID = Arg(("task_id",), help="The id of the task") +ARG_EXECUTION_DATE = Arg(("execution_date",), help="The execution date of the DAG", type=parsedate) +ARG_TASK_REGEX = Arg( + ("-t", "--task-regex"), help="The regex to filter specific task_ids to backfill (optional)" +) +ARG_SUBDIR = Arg( + ("-S", "--subdir"), + help=( + "File location or directory from which to look for the dag. " + "Defaults to '[AIRFLOW_HOME]/dags' where [AIRFLOW_HOME] is the " + "value you set for 'AIRFLOW_HOME' config you set in 'airflow.cfg' " + ), + default='[AIRFLOW_HOME]/dags' if BUILD_DOCS else settings.DAGS_FOLDER, +) +ARG_START_DATE = Arg(("-s", "--start-date"), help="Override start_date YYYY-MM-DD", type=parsedate) +ARG_END_DATE = Arg(("-e", "--end-date"), help="Override end_date YYYY-MM-DD", type=parsedate) +ARG_OUTPUT_PATH = Arg( + ( + "-o", + "--output-path", + ), + help="The output for generated yaml files", + type=str, + default="[CWD]" if BUILD_DOCS else os.getcwd(), +) +ARG_DRY_RUN = Arg( + ("-n", "--dry-run"), + help="Perform a dry run for each task. Only renders Template Fields for each task, nothing else", + action="store_true", +) +ARG_PID = Arg(("--pid",), help="PID file location", nargs='?') +ARG_DAEMON = Arg( + ("-D", "--daemon"), help="Daemonize instead of running in the foreground", action="store_true" +) +ARG_STDERR = Arg(("--stderr",), help="Redirect stderr to this file") +ARG_STDOUT = Arg(("--stdout",), help="Redirect stdout to this file") +ARG_LOG_FILE = Arg(("-l", "--log-file"), help="Location of the log file") +ARG_YES = Arg( + ("-y", "--yes"), help="Do not prompt to confirm reset. Use with care!", action="store_true", default=False +) +ARG_NO_CONFIRM = Arg( + ("-c", "--no_confirm"), help="Do not request confirmation. 
Use with care!", action="store_true", + default=False +) +# list_dag_runs +ARG_DAG_ID_OPT = Arg(("-d", "--dag-id"), help="The id of the dag") +ARG_NO_BACKFILL = Arg( + ("--no-backfill",), help="filter all the backfill dagruns given the dag id", action="store_true" +) +ARG_STATE = Arg(("--state",), help="Only list the dag runs corresponding to the state") + +# backfill +ARG_MARK_SUCCESS = Arg( + ("-m", "--mark-success"), help="Mark jobs as succeeded without running them", action="store_true" +) +ARG_LOCAL = Arg(("-l", "--local"), help="Run the task using the LocalExecutor", action="store_true") +ARG_POOL = Arg(("--pool",), "Resource pool to use") + +# list_tasks +ARG_TREE = Arg(("-t", "--tree"), help="Tree view", action="store_true") + +# clear +ARG_UPSTREAM = Arg(("-u", "--upstream"), help="Include upstream tasks", action="store_true") +ARG_ONLY_FAILED = Arg(("-f", "--only-failed"), help="Only failed jobs", action="store_true") +ARG_ONLY_RUNNING = Arg(("-r", "--only-running"), help="Only running jobs", action="store_true") +ARG_DOWNSTREAM = Arg(("-d", "--downstream"), help="Include downstream tasks", action="store_true") +ARG_EXCLUDE_SUBDAGS = Arg(("-x", "--exclude-subdags"), help="Exclude subdags", action="store_true") +ARG_EXCLUDE_PARENTDAG = Arg( + ("-X", "--exclude-parentdag"), + help="Exclude ParentDAGS if the task cleared is a part of a SubDAG", + action="store_true", +) +ARG_DAG_REGEX = Arg( + ("-R", "--dag-regex"), help="Search dag_id as regex instead of exact string", action="store_true" +) + +# show_dag +ARG_SAVE = Arg(("-s", "--save"), help="Saves the result to the indicated file.") + +ARG_IMGCAT = Arg(("--imgcat",), help="Displays graph using the imgcat tool.", action='store_true') + +# trigger_dag +ARG_RUN_ID = Arg(("-r", "--run-id"), help="Helps to identify this run") +ARG_CONF = Arg(('-c', '--conf'), help="JSON string that gets pickled into the DagRun's conf attribute") +ARG_EXEC_DATE = Arg(("-e", "--exec-date"), help="The execution date of the DAG", type=parsedate) + +# pool +ARG_POOL_NAME = Arg(("pool",), metavar='NAME', help="Pool name") +ARG_POOL_SLOTS = Arg(("slots",), type=int, help="Pool slots") +ARG_POOL_DESCRIPTION = Arg(("description",), help="Pool description") +ARG_POOL_IMPORT = Arg(("file",), metavar="FILEPATH", help="Import pools from JSON file") +ARG_POOL_EXPORT = Arg(("file",), metavar="FILEPATH", help="Export all pools to JSON file") + +# variables +ARG_VAR = Arg(("key",), help="Variable key") +ARG_VAR_VALUE = Arg(("value",), metavar='VALUE', help="Variable value") +ARG_DEFAULT = Arg( + ("-d", "--default"), metavar="VAL", default=None, help="Default value returned if variable does not exist" +) +ARG_JSON = Arg(("-j", "--json"), help="Deserialize JSON variable", action="store_true") +ARG_VAR_IMPORT = Arg(("file",), help="Import variables from JSON file") +ARG_VAR_EXPORT = Arg(("file",), help="Export all variables to JSON file") + +# kerberos +ARG_PRINCIPAL = Arg(("principal",), help="kerberos principal", nargs='?') +ARG_KEYTAB = Arg(("-k", "--keytab"), help="keytab", nargs='?', default=conf.get('kerberos', 'keytab')) +# run +# TODO(aoen): "force" is a poor choice of name here since it implies it overrides +# all dependencies (not just past success), e.g. the ignore_depends_on_past +# dependency. This flag should be deprecated and renamed to 'ignore_ti_state' and +# the "ignore_all_dependencies" command should be called the"force" command +# instead. 
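Editor's aside: a minimal sketch (not part of the patch) of how the new Arg helper above behaves, assuming the Arg class defined earlier is in scope; the --limit flag and its values are made up purely for illustration. Only keyword arguments that were explicitly passed survive the _UNSET filter, so add_to_parser forwards exactly those to argparse and argparse keeps its own defaults for everything else.

import argparse

limit_arg = Arg(("--limit",), help="Maximum number of rows", type=int, default=100)
# Unpassed keywords (action, nargs, choices, ...) were filtered out by the _UNSET sentinel.
assert limit_arg.kwargs == {"help": "Maximum number of rows", "type": int, "default": 100}

parser = argparse.ArgumentParser()
limit_arg.add_to_parser(parser)  # same as parser.add_argument("--limit", help=..., type=int, default=100)
print(parser.parse_args(["--limit", "5"]).limit)  # prints 5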
+ARG_INTERACTIVE = Arg( + ('-N', '--interactive'), + help='Do not capture standard output and error streams (useful for interactive debugging)', + action='store_true', +) +ARG_FORCE = Arg( + ("-f", "--force"), + help="Ignore previous task instance state, rerun regardless if task already succeeded/failed", + action="store_true", +) +ARG_RAW = Arg(("-r", "--raw"), argparse.SUPPRESS, "store_true") +ARG_IGNORE_ALL_DEPENDENCIES = Arg( + ("-A", "--ignore-all-dependencies"), + help="Ignores all non-critical dependencies, including ignore_ti_state and ignore_task_deps", + action="store_true", +) +# TODO(aoen): ignore_dependencies is a poor choice of name here because it is too +# vague (e.g. a task being in the appropriate state to be run is also a dependency +# but is not ignored by this flag), the name 'ignore_task_dependencies' is +# slightly better (as it ignores all dependencies that are specific to the task), +# so deprecate the old command name and use this instead. +ARG_IGNORE_DEPENDENCIES = Arg( + ("-i", "--ignore-dependencies"), + help="Ignore task-specific dependencies, e.g. upstream, depends_on_past, and retry delay dependencies", + action="store_true", +) +ARG_IGNORE_DEPENDS_ON_PAST = Arg( + ("-I", "--ignore-depends-on-past"), + help="Ignore depends_on_past dependencies (but respect upstream dependencies)", + action="store_true", +) +ARG_SHIP_DAG = Arg( + ("--ship-dag",), help="Pickles (serializes) the DAG and ships it to the worker", action="store_true" +) +ARG_PICKLE = Arg(("-p", "--pickle"), help="Serialized pickle object of the entire dag (used internally)") +ARG_JOB_ID = Arg(("-j", "--job-id"), help=argparse.SUPPRESS) +ARG_CFG_PATH = Arg(("--cfg-path",), help="Path to config file to use instead of airflow.cfg") + +# worker +ARG_QUEUES = Arg( + ("-q", "--queues"), + help="Comma delimited list of queues to serve", + default=conf.get('celery', 'DEFAULT_QUEUE'), +) +ARG_CONCURRENCY = Arg( + ("-c", "--concurrency"), + type=int, + help="The number of worker processes", + default=conf.get('celery', 'worker_concurrency'), +) +ARG_CELERY_HOSTNAME = Arg( + ("-H", "--celery-hostname"), + help="Set the hostname of celery worker if you have multiple workers on a single machine", +) + +# flower +ARG_BROKER_API = Arg(("-a", "--broker-api"), help="Broker API") +ARG_FLOWER_HOSTNAME = Arg( + ("-H", "--hostname"), + default=conf.get('celery', 'FLOWER_HOST'), + help="Set the hostname on which to run the server", +) +ARG_FLOWER_PORT = Arg( + ("-p", "--port"), + default=conf.get('celery', 'FLOWER_PORT'), + type=int, + help="The port on which to run the server", +) +ARG_FLOWER_CONF = Arg(("-c", "--flower-conf"), help="Configuration file for flower") +ARG_FLOWER_URL_PREFIX = Arg( + ("-u", "--url-prefix"), default=conf.get('celery', 'FLOWER_URL_PREFIX'), help="URL prefix for Flower" +) +ARG_FLOWER_BASIC_AUTH = Arg( + ("-A", "--basic-auth"), + default=conf.get('celery', 'FLOWER_BASIC_AUTH'), + help=( + "Securing Flower with Basic Authentication. " + "Accepts user:password pairs separated by a comma. 
" + "Example: flower_basic_auth = user1:password1,user2:password2" + ), +) +ARG_TASK_PARAMS = Arg(("-t", "--task-params"), help="Sends a JSON params dict to the task") +ARG_POST_MORTEM = Arg( + ("-m", "--post-mortem"), action="store_true", help="Open debugger on uncaught exception" +) + +# connections +ARG_CONN_ID = Arg(('conn_id',), help='Connection id, required to get/add/delete a connection', type=str) +ARG_CONN_URI = Arg( + ('--conn-uri',), help='Connection URI, required to add a connection without conn_type', type=str +) +ARG_CONN_TYPE = Arg( + ('--conn-type',), help='Connection type, required to add a connection without conn_uri', type=str +) +ARG_CONN_HOST = Arg(('--conn-host',), help='Connection host, optional when adding a connection', type=str) +ARG_CONN_LOGIN = Arg(('--conn-login',), help='Connection login, optional when adding a connection', type=str) +ARG_CONN_PASSWORD = Arg( + ('--conn-password',), help='Connection password, optional when adding a connection', type=str +) +ARG_CONN_SCHEMA = Arg( + ('--conn-schema',), help='Connection schema, optional when adding a connection', type=str +) +ARG_CONN_PORT = Arg(('--conn-port',), help='Connection port, optional when adding a connection', type=str) +ARG_CONN_EXTRA = Arg( + ('--conn-extra',), help='Connection `Extra` field, optional when adding a connection', type=str +) + +# users +ARG_USERNAME = Arg(('-u', '--username'), help='Username of the user', required=True, type=str) +ARG_FIRSTNAME = Arg(('-f', '--firstname'), help='First name of the user', required=True, type=str) +ARG_LASTNAME = Arg(('-l', '--lastname'), help='Last name of the user', required=True, type=str) +ARG_ROLE = Arg( + ('-r', '--role'), + help='Role of the user. Existing roles include Admin, User, Op, Viewer, and Public', + required=True, + type=str, +) +ARG_EMAIL = Arg(('-e', '--email'), help='Email of the user', required=True, type=str) +ARG_PASSWORD = Arg( + ('-p', '--password'), + help='Password of the user, required to create a user without --use-random-password', + type=str, +) +ARG_USE_RANDOM_PASSWORD = Arg( + ('--use-random-password',), + help='Do not prompt for password. Use random string instead.' + ' Required to create a user without --password ', + default=False, + action='store_true', +) + +# roles +ARG_AUTOSCALE = Arg(('-a', '--autoscale'), help="Minimum and Maximum number of worker to autoscale") +ARG_SKIP_SERVE_LOGS = Arg( + ("-s", "--skip-serve-logs"), + default=False, + help="Don't start the serve logs process along with the workers", + action="store_true", +) + +# kubernetes cleanup-pods +ARG_NAMESPACE = Arg( + ("--namespace",), + default='default', + help="Kubernetes Namespace", +) + +ALTERNATIVE_CONN_SPECS_ARGS = [ + ARG_CONN_TYPE, + ARG_CONN_HOST, + ARG_CONN_LOGIN, + ARG_CONN_PASSWORD, + ARG_CONN_SCHEMA, + ARG_CONN_PORT, +] + +# A special "argument" (that is hidden from help) that sets `args.deprecation_warning=False` in the resulting +# Namespace. Add this to any commands that use the same implementation function in new and old names to +# supresses the warning for the new form. 
+NOT_DEPRECATED = Arg(("--deprecation_warning",), help=argparse.SUPPRESS, default=False, required=False) + +_ActionCommand = namedtuple('ActionCommand', ['name', 'help', 'func', 'args', 'description', 'epilog', + 'prog']) +_GroupCommand = namedtuple('GroupCommand', ['name', 'help', 'subcommands', 'description', 'epilog']) + +_ActionCommand.__new__.__defaults__ = (None,) * len(_ActionCommand._fields) # type: ignore +_GroupCommand.__new__.__defaults__ = (None,) * len(_GroupCommand._fields) # type: ignore + +ActionCommand = cast(Any, _ActionCommand) +GroupCommand = cast(Any, _GroupCommand) + + +DAGS_COMMANDS = ( + ActionCommand( + name='list', + help="List all the DAGs", + func=list_dags, + args=(ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='report', + help='Show DagBag loading report', + func=list_dags_report, + args=(ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='list-runs', + help="List DAG runs given a DAG id", + description=( + "List DAG runs given a DAG id. If state option is given, it will only search for all the " + "dagruns with the given state. If no_backfill option is given, it will filter out all " + "backfill dagruns for given dag id. If start_date is given, it will filter out all the " + "dagruns that were executed before this date. If end_date is given, it will filter out " + "all the dagruns that were executed after this date. " + ), + func=list_dag_runs, + args=(ARG_DAG_ID_OPT, ARG_NO_BACKFILL, ARG_STATE, NOT_DEPRECATED), + ), + ActionCommand( + name='state', + help="Get the status of a dag run", + func=dag_state, + args=(ARG_DAG_ID, ARG_EXECUTION_DATE, ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='next-execution', + help="Get the next execution datetimes of a DAG", + description=( + "Get the next execution datetimes of a DAG. It returns one execution unless the " + "num-executions option is given" + ), + func=next_execution, + args=(ARG_DAG_ID, ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='pause', + help='Pause a DAG', + func=pause, + args=(ARG_DAG_ID, ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='unpause', + help='Resume a paused DAG', + func=unpause, + args=(ARG_DAG_ID, ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='trigger', + help='Trigger a DAG run', + func=trigger_dag, + args=(ARG_DAG_ID, ARG_SUBDIR, ARG_RUN_ID, ARG_CONF, ARG_EXEC_DATE, NOT_DEPRECATED), + ), + ActionCommand( + name='delete', + help="Delete all DB records related to the specified DAG", + func=delete_dag, + args=(ARG_DAG_ID, ARG_YES, NOT_DEPRECATED), + ), + ActionCommand( + name='show', + help="Displays DAG's tasks with their dependencies", + description=( + "The --imgcat option only works in iTerm.\n" + "\n" + "For more information, see: https://www.iterm2.com/documentation-images.html\n" + "\n" + "The --save option saves the result to the indicated file.\n" + "\n" + "The file format is determined by the file extension. 
" + "For more information about supported " + "format, see: https://www.graphviz.org/doc/info/output.html\n" + "\n" + "If you want to create a PNG file then you should execute the following command:\n" + "airflow dags show --save output.png\n" + "\n" + "If you want to create a DOT file then you should execute the following command:\n" + "airflow dags show --save output.dot\n" + ), + func=show_dag, + args=( + ARG_DAG_ID, + ARG_SUBDIR, + ARG_SAVE, + ARG_IMGCAT, + NOT_DEPRECATED, + ), + ), +) +TASKS_COMMANDS = ( + ActionCommand( + name='list', + help="List the tasks within a DAG", + func=list_tasks, + args=(ARG_DAG_ID, ARG_TREE, ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='clear', + help="Clear a set of task instance, as if they never ran", + func=clear, + args=( + ARG_DAG_ID, + ARG_TASK_REGEX, + ARG_START_DATE, + ARG_END_DATE, + ARG_SUBDIR, + ARG_UPSTREAM, + ARG_DOWNSTREAM, + ARG_YES, + ARG_NO_CONFIRM, + ARG_ONLY_FAILED, + ARG_ONLY_RUNNING, + ARG_EXCLUDE_SUBDAGS, + ARG_EXCLUDE_PARENTDAG, + ARG_DAG_REGEX, + NOT_DEPRECATED, + ), + ), + ActionCommand( + name='state', + help="Get the status of a task instance", + func=task_state, + args=(ARG_DAG_ID, ARG_TASK_ID, ARG_EXECUTION_DATE, ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='failed-deps', + help="Returns the unmet dependencies for a task instance", + description=( + "Returns the unmet dependencies for a task instance from the perspective of the scheduler. " + "In other words, why a task instance doesn't get scheduled and then queued by the scheduler, " + "and then run by an executor." + ), + func=task_failed_deps, + args=(ARG_DAG_ID, ARG_TASK_ID, ARG_EXECUTION_DATE, ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='render', + help="Render a task instance's template(s)", + func=render, + args=(ARG_DAG_ID, ARG_TASK_ID, ARG_EXECUTION_DATE, ARG_SUBDIR, NOT_DEPRECATED), + ), + ActionCommand( + name='run', + help="Run a single task instance", + func=run, + args=( + ARG_DAG_ID, + ARG_TASK_ID, + ARG_EXECUTION_DATE, + ARG_SUBDIR, + ARG_MARK_SUCCESS, + ARG_FORCE, + ARG_POOL, + ARG_CFG_PATH, + ARG_LOCAL, + ARG_RAW, + ARG_IGNORE_ALL_DEPENDENCIES, + ARG_IGNORE_DEPENDENCIES, + ARG_IGNORE_DEPENDS_ON_PAST, + ARG_SHIP_DAG, + ARG_PICKLE, + ARG_JOB_ID, + ARG_INTERACTIVE, + NOT_DEPRECATED, + ), + ), + ActionCommand( + name='test', + help="Test a task instance", + description=( + "Test a task instance. 
This will run a task without checking for dependencies or recording " + "its state in the database" + ), + func=test, + args=( + ARG_DAG_ID, + ARG_TASK_ID, + ARG_EXECUTION_DATE, + ARG_SUBDIR, + ARG_DRY_RUN, + ARG_TASK_PARAMS, + ARG_POST_MORTEM, + NOT_DEPRECATED, + ), + ), +) +POOLS_COMMANDS = ( + ActionCommand( + name='list', + help='List pools', + func=pool_list, + args=(NOT_DEPRECATED,), + ), + ActionCommand( + name='get', + help='Get pool size', + func=pool_get, + args=( + ARG_POOL_NAME, + NOT_DEPRECATED, + ), + ), + ActionCommand( + name='set', + help='Configure pool', + func=pool_set, + args=( + ARG_POOL_NAME, + ARG_POOL_SLOTS, + ARG_POOL_DESCRIPTION, + NOT_DEPRECATED, + ), + ), + ActionCommand( + name='delete', + help='Delete pool', + func=pool_delete, + args=( + ARG_POOL_NAME, + NOT_DEPRECATED, + ), + ), + ActionCommand( + name='import', + help='Import pools', + func=pool_import, + args=( + ARG_POOL_IMPORT, + NOT_DEPRECATED, + ), + ), + ActionCommand( + name='export', + help='Export all pools', + func=pool_import, + args=( + ARG_POOL_EXPORT, + NOT_DEPRECATED, + ), + ), +) +VARIABLES_COMMANDS = ( + ActionCommand( + name='list', + help='List variables', + func=variables_list, + args=(NOT_DEPRECATED,), + ), + ActionCommand( + name='get', + help='Get variable', + func=variables_get, + args=(ARG_VAR, ARG_JSON, ARG_DEFAULT, NOT_DEPRECATED), + ), + ActionCommand( + name='set', + help='Set variable', + func=variables_set, + args=(ARG_VAR, ARG_VAR_VALUE, NOT_DEPRECATED), + ), + ActionCommand( + name='delete', + help='Delete variable', + func=variables_delete, + args=(ARG_VAR, NOT_DEPRECATED), + ), + ActionCommand( + name='import', + help='Import variables', + func=variables_import, + args=(ARG_VAR_IMPORT, NOT_DEPRECATED), + ), + ActionCommand( + name='export', + help='Export all variables', + func=variables_export, + args=(ARG_VAR_EXPORT, NOT_DEPRECATED), + ), +) +DB_COMMANDS = ( + ActionCommand( + name='init', + help="Initialize the metadata database", + func=initdb, + args=(NOT_DEPRECATED,), + ), + ActionCommand( + name='reset', + help="Burn down and rebuild the metadata database", + func=resetdb, + args=(ARG_YES, NOT_DEPRECATED), + ), + ActionCommand( + name='upgrade', + help="Upgrade the metadata database to latest version", + func=upgradedb, + args=(NOT_DEPRECATED,), + ), + ActionCommand( + name='shell', + help="Runs a shell to access the database", + func=shell, + args=(NOT_DEPRECATED,), + ), + ActionCommand( + name='check', + help="Check if the database can be reached", + func=checkdb, + args=(NOT_DEPRECATED,), + ), +) +CONNECTIONS_COMMANDS = ( + ActionCommand( + name='list', + help='List connections', + func=connections_list, + args=(NOT_DEPRECATED,), + ), + ActionCommand( + name='add', + help='Add a connection', + func=connections_add, + args=(ARG_CONN_ID, ARG_CONN_URI, ARG_CONN_EXTRA, NOT_DEPRECATED) + tuple(ALTERNATIVE_CONN_SPECS_ARGS), + ), + ActionCommand( + name='delete', + help='Delete a connection', + func=connections_delete, + args=(ARG_CONN_ID, NOT_DEPRECATED), + ), +) + +USERS_COMMANDS = ( + ActionCommand( + name='list', + help='List users', + func=list_users, + args=(NOT_DEPRECATED,), + ), + ActionCommand( + name='create', + help='Create a user', + func=create_user, + args=( + ARG_ROLE, + ARG_USERNAME, + ARG_EMAIL, + ARG_FIRSTNAME, + ARG_LASTNAME, + ARG_PASSWORD, + ARG_USE_RANDOM_PASSWORD, + NOT_DEPRECATED, + ), + epilog=( + 'examples:\n' + 'To create an user with "Admin" role and username equals to "admin", run:\n' + '\n' + ' $ airflow users create \\\n' + ' --username 
admin \\\n' + ' --firstname FIRST_NAME \\\n' + ' --lastname LAST_NAME \\\n' + ' --role Admin \\\n' + ' --email admin@example.org' + ), + ), + ActionCommand( + name='delete', + help='Delete a user', + func=delete_user, + args=(ARG_USERNAME, NOT_DEPRECATED), + ), +) + +CELERY_COMMANDS = ( + ActionCommand( + name='worker', + help="Start a Celery worker node", + func=worker, + args=( + ARG_QUEUES, + ARG_CONCURRENCY, + ARG_CELERY_HOSTNAME, + ARG_PID, + ARG_DAEMON, + ARG_STDOUT, + ARG_STDERR, + ARG_LOG_FILE, + ARG_AUTOSCALE, + ARG_SKIP_SERVE_LOGS, + NOT_DEPRECATED, + ), + ), + ActionCommand( + name='flower', + help="Start a Celery Flower", + func=flower, + args=( + ARG_FLOWER_HOSTNAME, + ARG_FLOWER_PORT, + ARG_FLOWER_CONF, + ARG_FLOWER_URL_PREFIX, + ARG_FLOWER_BASIC_AUTH, + ARG_BROKER_API, + ARG_PID, + ARG_DAEMON, + ARG_STDOUT, + ARG_STDERR, + ARG_LOG_FILE, + NOT_DEPRECATED, + ), + ), +) + +CONFIG_COMMANDS = ( + ActionCommand( + name='list', + help='List options for the configuration', + func=config, + args=(NOT_DEPRECATED,), + ), +) + +KUBERNETES_COMMANDS = ( + ActionCommand( + name='cleanup-pods', + help="Clean up Kubernetes pods in evicted/failed/succeeded states", + func=cleanup_pods, + args=(ARG_NAMESPACE, ), + ), + ActionCommand( + name='generate-dag-yaml', + help="Generate YAML files for all tasks in DAG. Useful for debugging tasks without " + "launching into a cluster", + func=kubernetes_generate_dag_yaml, + args=(ARG_DAG_ID, ARG_EXECUTION_DATE, ARG_SUBDIR, ARG_OUTPUT_PATH, NOT_DEPRECATED), + ), +) + +airflow_commands = [ + GroupCommand( + name='dags', + help='Manage DAGs', + subcommands=DAGS_COMMANDS, + ), + GroupCommand( + name="kubernetes", help='Tools to help run the KubernetesExecutor', subcommands=KUBERNETES_COMMANDS + ), + GroupCommand( + name='tasks', + help='Manage tasks', + subcommands=TASKS_COMMANDS, + ), + GroupCommand( + name='pools', + help="Manage pools", + subcommands=POOLS_COMMANDS, + ), + GroupCommand( + name='variables', + help="Manage variables", + subcommands=VARIABLES_COMMANDS, + ), + GroupCommand( + name='db', + help="Database operations", + subcommands=DB_COMMANDS, + ), + ActionCommand( + name='kerberos', + help="Start a kerberos ticket renewer", + func=kerberos, + args=(ARG_PRINCIPAL, ARG_KEYTAB, ARG_PID, ARG_DAEMON, ARG_STDOUT, ARG_STDERR, ARG_LOG_FILE), + ), + GroupCommand( + name='connections', + help="Manage connections", + subcommands=CONNECTIONS_COMMANDS, + ), + GroupCommand( + name='users', + help="Manage users", + subcommands=USERS_COMMANDS, + ), + ActionCommand( + name='sync-perm', + help="Update permissions for existing roles and DAGs", + func=sync_perm, + args=(NOT_DEPRECATED,), + ), + ActionCommand( + name='rotate-fernet-key', + func=rotate_fernet_key, + help='Rotate encrypted connection credentials and variables', + description=( + 'Rotate all encrypted connection credentials and variables; see ' + 'https://airflow.apache.org/docs/stable/howto/secure-connections.html' + '#rotating-encryption-keys' + ), + args=(NOT_DEPRECATED,), + ), + GroupCommand(name="config", help='View configuration', subcommands=CONFIG_COMMANDS), + GroupCommand( + name="celery", + help='Celery components', + description=( + 'Start celery components. Works only when using CeleryExecutor. 
For more information, see ' + 'https://airflow.apache.org/docs/stable/executor/celery.html' + ), + subcommands=CELERY_COMMANDS, + ), +] +ALL_COMMANDS_DICT = {sp.name: sp for sp in airflow_commands} +DAG_CLI_COMMANDS = {'list_tasks', 'backfill', 'test', 'run', 'pause', 'unpause', 'list_dag_runs'} + + +class AirflowHelpFormatter(argparse.HelpFormatter): + """ + Custom help formatter to display help message. + + It displays simple commands and groups of commands in separate sections. + """ + + def _format_action(self, action): + if isinstance(action, argparse._SubParsersAction): # pylint: disable=protected-access + + parts = [] + action_header = self._format_action_invocation(action) + action_header = '%*s%s\n' % (self._current_indent, '', action_header) + parts.append(action_header) + + self._indent() + subactions = action._get_subactions() # pylint: disable=protected-access + + action_subcommands, group_subcommands = partition( + lambda d: isinstance(ALL_COMMANDS_DICT.get(d.dest, None), GroupCommand), subactions + ) + # Remove deprecated groups from the list -- we don't want to show them + parts.append("\n") + parts.append('%*s%s:\n' % (self._current_indent, '', "Groups")) + self._indent() + for subaction in group_subcommands: + parts.append(self._format_action(subaction)) + self._dedent() + + parts.append("\n") + parts.append('%*s%s:\n' % (self._current_indent, '', "Commands")) + self._indent() + + for subaction in action_subcommands: + if getattr(action.choices[subaction.dest], 'hide_from_toplevel_help', False): + continue + parts.append(self._format_action(subaction)) + self._dedent() + self._dedent() + + # return a single string + return self._join_parts(parts) + + return super(AirflowHelpFormatter, self)._format_action(action) + + +def partition(pred, iterable): + iter_1, iter_2 = itertools.tee(iterable) + return itertools.filterfalse(pred, iter_1), filter(pred, iter_2) + + +def _sort_args(args): + """Sort subcommand optional args, keep positional args""" + + def get_long_option(arg): + """Get long option from Arg.flags""" + return arg.flags[0] if len(arg.flags) == 1 else arg.flags[1] + + positional, optional = partition(lambda x: x.flags[0].startswith("-"), args) + for p in positional: + yield p + for o in sorted(optional, key=lambda x: get_long_option(x).lower()): + yield o + + +def _add_command(subparsers, sub): + + sub_proc = subparsers.add_parser( + sub.name, help=sub.help, description=sub.description or sub.help, epilog=sub.epilog, + ) + sub_proc.formatter_class = argparse.RawTextHelpFormatter + + if isinstance(sub, GroupCommand): + return _add_group_command(sub, sub_proc) + elif isinstance(sub, ActionCommand): + if sub.prog: + sub_proc.prog = sub.prog + return _add_action_command(sub, sub_proc) + else: + raise AirflowException("Invalid command definition.") + + +def _add_action_command(sub, sub_proc): + for arg in _sort_args(sub.args): + arg.add_to_parser(sub_proc) + sub_proc.set_defaults(func=sub.func) + + +def _add_group_command(sub, sub_proc): + subcommands = sub.subcommands + sub_subparsers = sub_proc.add_subparsers(dest="subcommand", metavar="COMMAND") + sub_subparsers.required = True + + for command in sorted(subcommands, key=lambda x: x.name): + _add_command(sub_subparsers, command) + return sub_proc, sub_subparsers class CLIFactory(object): @@ -1581,6 +3453,11 @@ class CLIFactory(object): 'execution_date': Arg( ("execution_date",), help="The execution date of the DAG", type=parsedate), + 'output_path': Arg( + ('-o', '--output-path'), + help="output path for yaml file", + 
default=os.getcwd() + ), 'task_regex': Arg( ("-t", "--task_regex"), "The regex to filter specific task_ids to backfill (optional)"), @@ -1597,7 +3474,10 @@ class CLIFactory(object): ("-e", "--end_date"), "Override end_date YYYY-MM-DD", type=parsedate), 'dry_run': Arg( - ("-dr", "--dry_run"), "Perform a dry run", "store_true"), + ("-dr", "--dry_run"), + "Perform a dry run for each task. Only renders Template Fields " + "for each task, nothing else", + "store_true"), 'pid': Arg( ("--pid",), "PID file location", nargs='?'), @@ -1721,16 +3601,16 @@ class CLIFactory(object): # show_dag 'save': Arg( ("-s", "--save"), - "Saves the result to the indicated file.\n" + "Saves the result to the indicated file. The file format is determined by the file extension.\n" "\n" - "The file format is determined by the file extension. For more information about supported " - "format, see: https://www.graphviz.org/doc/info/output.html\n" + "To see more information about supported format for show_dags command, see: " + "https://www.graphviz.org/doc/info/output.html\n" "\n" "If you want to create a PNG file then you should execute the following command:\n" - "airflow dags show --save output.png\n" + "airflow show_dag --save output.png\n" "\n" "If you want to create a DOT file then you should execute the following command:\n" - "airflow dags show --save output.dot\n" + "airflow show_dag --save output.dot\n" ), 'imgcat': Arg( ("--imgcat", ), @@ -2047,6 +3927,27 @@ class CLIFactory(object): default=False, help="Don't start the serve logs process along with the workers.", action="store_true"), + 'color': Arg( + ('--color',), + help="Do emit colored output (default: auto)", + choices={cli_utils.ColorMode.ON, cli_utils.ColorMode.OFF, cli_utils.ColorMode.AUTO}, + default=cli_utils.ColorMode.AUTO), + # info + 'anonymize': Arg( + ('--anonymize',), + help=( + 'Minimize any personal identifiable information. ' + 'Use it when sharing output with others.' + ), + action='store_true' + ), + 'file_io': Arg( + ('--file-io',), + help=( + 'Send output to file.io service and returns link.' + ), + action='store_true' + ) } subparsers = ( { @@ -2067,6 +3968,40 @@ class CLIFactory(object): 'reset_dag_run', 'rerun_failed_tasks', 'run_backwards' ) }, { + 'func': generate_pod_template, + 'help': "Reads your airflow.cfg and migrates your configurations into a " + "airflow_template.yaml file. From this point a user can link" + "this file to airflow using the `pod_template_file` argument" + "and modify using the Kubernetes API", + 'args': ('output_path',), + }, { + 'func': serve_logs, + 'help': "Serve logs generate by worker", + 'args': tuple(), + }, { + 'func': scheduler, + 'help': "Start a scheduler instance", + 'args': ('dag_id_opt', 'subdir', 'run_duration', 'num_runs', + 'do_pickle', 'pid', 'daemon', 'stdout', 'stderr', + 'log_file'), + }, { + 'func': webserver, + 'help': "Start a Airflow webserver instance", + 'args': ('port', 'workers', 'workerclass', 'worker_timeout', 'hostname', + 'pid', 'daemon', 'stdout', 'stderr', 'access_logfile', + 'error_logfile', 'log_file', 'ssl_cert', 'ssl_key', 'debug'), + }, { + 'help': 'Show the version', + 'func': version, + 'args': tuple(), + }, { + 'help': 'Show information about current Airflow and environment', + 'func': info, + 'args': ('anonymize', 'file_io', ), + }, + ) + deprecated_subparsers = ( + { 'func': list_dag_runs, 'help': "List dag runs given a DAG id. If state option is given, it will only" "search for all the dagruns with the given state. 
" @@ -2079,12 +4014,21 @@ class CLIFactory(object): 'func': list_tasks, 'help': "List the tasks within a DAG", 'args': ('dag_id', 'tree', 'subdir'), + }, { + 'func': kubernetes_generate_dag_yaml, + 'help': "List dag runs given a DAG id. If state option is given, it will only" + "search for all the dagruns with the given state. " + "If no_backfill option is given, it will filter out" + "all backfill dagruns for given dag id.", + 'args': ( + 'dag_id', 'output_path', 'subdir', 'execution_date' + ) }, { 'func': clear, 'help': "Clear a set of task instance, as if they never ran", 'args': ( 'dag_id', 'task_regex', 'start_date', 'end_date', 'subdir', - 'upstream', 'downstream', 'no_confirm', 'only_failed', + 'upstream', 'downstream', 'no_confirm', 'only_failed', 'yes', 'only_running', 'exclude_subdags', 'exclude_parentdag', 'dag_regex'), }, { 'func': pause, @@ -2115,11 +4059,6 @@ class CLIFactory(object): 'help': "CRUD operations on variables", "args": ('set', 'get', 'json', 'default', 'var_import', 'var_export', 'var_delete'), - }, { - 'func': kerberos, - 'help': "Start a kerberos ticket renewer", - 'args': ('principal', 'keytab', 'pid', - 'daemon', 'stdout', 'stderr', 'log_file'), }, { 'func': render, 'help': "Render a task instance's template(s)", @@ -2156,10 +4095,6 @@ class CLIFactory(object): 'func': task_state, 'help': "Get the status of a task instance", 'args': ('dag_id', 'task_id', 'execution_date', 'subdir'), - }, { - 'func': serve_logs, - 'help': "Serve logs generate by worker", - 'args': tuple(), }, { 'func': test, 'help': ( @@ -2168,12 +4103,6 @@ class CLIFactory(object): 'args': ( 'dag_id', 'task_id', 'execution_date', 'subdir', 'dry_run', 'task_params', 'post_mortem'), - }, { - 'func': webserver, - 'help': "Start a Airflow webserver instance", - 'args': ('port', 'workers', 'workerclass', 'worker_timeout', 'hostname', - 'pid', 'daemon', 'stdout', 'stderr', 'access_logfile', - 'error_logfile', 'log_file', 'ssl_cert', 'ssl_key', 'debug'), }, { 'func': resetdb, 'help': "Burn down and rebuild the metadata database", @@ -2190,12 +4119,6 @@ class CLIFactory(object): 'func': shell, 'help': "Runs a shell to access the database", 'args': tuple(), - }, { - 'func': scheduler, - 'help': "Start a scheduler instance", - 'args': ('dag_id_opt', 'subdir', 'run_duration', 'num_runs', - 'do_pickle', 'pid', 'daemon', 'stdout', 'stderr', - 'log_file'), }, { 'func': worker, 'help': "Start a Celery worker node", @@ -2206,10 +4129,6 @@ class CLIFactory(object): 'help': "Start a Celery Flower", 'args': ('flower_hostname', 'flower_port', 'flower_conf', 'flower_url_prefix', 'flower_basic_auth', 'broker_api', 'pid', 'daemon', 'stdout', 'stderr', 'log_file'), - }, { - 'func': version, - 'help': "Show the version", - 'args': tuple(), }, { 'func': connections, 'help': "List/Add/Delete connections", @@ -2246,37 +4165,114 @@ class CLIFactory(object): '#rotating-encryption-keys.', 'args': (), }, + { + 'help': 'Show current application configuration', + 'func': config, + 'args': ('color', ), + }, + { + 'name': 'upgrade_check', + 'help': 'Check if you can safely upgrade to the new version.', + 'func': upgrade_check, + 'from_module': 'airflow.upgrade.checker', + 'args': (), + }, ) - subparsers_dict = {sp['func'].__name__: sp for sp in subparsers} - dag_subparsers = ( + deprecated_dag_subparsers = ( 'list_tasks', 'backfill', 'test', 'run', 'pause', 'unpause', 'list_dag_runs') @classmethod def get_parser(cls, dag_parser=False): """Creates and returns command line argument parser""" + + deprecated_subparsers_dict = 
{sp['func'].__name__: sp for sp in cls.deprecated_subparsers} + class DefaultHelpParser(argparse.ArgumentParser): """Override argparse.ArgumentParser.error and use print_help instead of print_usage""" def error(self, message): self.print_help() self.exit(2, '\n{} command error: {}, see help above.\n'.format(self.prog, message)) - parser = DefaultHelpParser() - subparsers = parser.add_subparsers( - help='sub-command help', dest='subcommand') + + def parse_known_args(self, args, namespace): + # Compat hack for optional sub-arguments in Py 2.7 + fake_opt = getattr(self, "_fake_optional_subparser", False) and \ + (args == [] or args[0].startswith('-')) + if fake_opt: + args = ["deprecated_"] + args + + args, remain = super(DefaultHelpParser, self).parse_known_args(args, namespace) + + if fake_opt: + # So it doesn't show up as "deprecated_" + args.subcommand = self._fake_optional_subparser + return args, remain + + parser = DefaultHelpParser(formatter_class=AirflowHelpFormatter) + subparsers = parser.add_subparsers(dest='subcommand', metavar="GROUP_OR_COMMAND") subparsers.required = True - subparser_list = cls.dag_subparsers if dag_parser else cls.subparsers_dict.keys() - for sub in subparser_list: - sub = cls.subparsers_dict[sub] + subparser_list = DAG_CLI_COMMANDS if dag_parser else ALL_COMMANDS_DICT.keys() + for sub_name in sorted(subparser_list): + action = _add_command(subparsers, ALL_COMMANDS_DICT[sub_name]) + + # Deprecated "mode select", and new sub-command version? Merge them + # so they both work, but don't show help for the deprecated + # options! + if sub_name in deprecated_subparsers_dict and action is not None: + sp, sub_subparsers = action + deprecated = deprecated_subparsers_dict.pop(sub_name) + sp.set_defaults(func=deprecated['func']) + if six.PY3: + sub_subparsers.required = False + + for arg in deprecated['args']: + if 'dag_id' in arg and dag_parser: + continue + arg = cls.args[arg] + # Don't show these options in the help output + kwargs = arg.kwargs.copy() + kwargs['help'] = argparse.SUPPRESS + sp.add_argument(*arg.flags, **kwargs) + else: + # Py2 doesn't support optional subcommands, so we have to fake it + sp._fake_optional_subparser = sub_name + _add_command(sub_subparsers, ActionCommand( + prog=sp.prog, + name='deprecated_', + help=deprecated['help'], + func=deprecated['func'], + args=(cls.args[arg] for arg in deprecated['args']), + )) + + if dag_parser: + subparser_list = [ + (deprecated_subparsers_dict[name], False) + for name in cls.deprecated_dag_subparsers + ] + else: + current = zip(cls.subparsers, itertools.repeat(False)) + deprecated = zip(deprecated_subparsers_dict.values(), itertools.repeat(True)) + subparser_list = itertools.chain(current, deprecated) + for (sub, hide_from_toplevel_help) in subparser_list: + if hide_from_toplevel_help and BUILD_DOCS: + # Don't show the deprecated commands in the docs + continue + sp = subparsers.add_parser(sub['func'].__name__, help=sub['help']) + sp.hide_from_toplevel_help = hide_from_toplevel_help + sp.set_defaults(func=sub['func']) + if 'from_module' in sub: + try: + mod = importlib.import_module(sub['from_module']) + mod.register_arguments(sp) + continue + except ImportError: + pass for arg in sub['args']: if 'dag_id' in arg and dag_parser: continue arg = cls.args[arg] - kwargs = { - f: v - for f, v in vars(arg).items() if f != 'flags' and v} - sp.add_argument(*arg.flags, **kwargs) - sp.set_defaults(func=sub['func']) + sp.add_argument(*arg.flags, **arg.kwargs) return parser diff --git a/airflow/config_templates/config.yml 
b/airflow/config_templates/config.yml index 4ff899a2a8c79..4040131d001ea 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -254,6 +254,16 @@ type: string example: ~ default: "" + - name: sql_alchemy_connect_args + description: | + Import path for connect args in SqlAlchemy. Default to an empty dict. + This is useful when you want to configure db engine args that SqlAlchemy won't parse + in connection string. + See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args + version_added: 1.10.11 + type: string + example: ~ + default: ~ - name: parallelism description: | The amount of parallelism as a setting to the executor. This defines @@ -437,6 +447,14 @@ type: string example: ~ default: "30" + - name: min_serialized_dag_fetch_interval + description: | + Fetching serialized DAG can not be faster than a minimum interval to reduce database + read rate. This config controls when your DAGs are updated in the Webserver + version_added: 1.10.12 + type: string + example: ~ + default: "10" - name: store_dag_code description: | Whether to persist DAG files code in DB. @@ -445,8 +463,8 @@ ``store_serialized_dags`` setting. version_added: 1.10.10 type: string - example: ~ - default: "%(store_serialized_dags)s" + example: "False" + default: ~ - name: max_num_rendered_ti_fields_per_task description: | Maximum number of Rendered Task Instance Fields (Template Fields) per task to store @@ -466,6 +484,13 @@ type: string example: ~ default: "True" + - name: xcom_backend + description: | + Path to custom XCom class that will be used to store and resolve operators results + version_added: 1.10.12 + type: string + example: "path.to.CustomXCom" + default: "airflow.models.xcom.BaseXCom" - name: secrets description: ~ @@ -524,11 +549,13 @@ options: - name: auth_backend description: | - How to authenticate users of the API + How to authenticate users of the API. See + https://airflow.apache.org/docs/stable/security.html for possible values. + ("airflow.api.auth.backend.default" allows all requests for historic reasons) version_added: ~ type: string example: ~ - default: "airflow.api.auth.backend.default" + default: "airflow.api.auth.backend.deny_all" - name: lineage description: ~ options: @@ -699,6 +726,14 @@ type: string example: ~ default: "30" + - name: reload_on_plugin_change + description: | + If set to True, Airflow will track files in plugins_folder directory. When it detects changes, + then reload the gunicorn. + version_added: 1.10.11 + type: boolean + example: ~ + default: "False" - name: secret_key description: | Secret key used to run your flask app @@ -706,7 +741,7 @@ version_added: ~ type: string example: ~ - default: "temporary_key" + default: "{SECRET_KEY}" - name: workers description: | Number of workers to run the Gunicorn web server @@ -966,21 +1001,14 @@ type: string example: ~ default: "True" - - name: force_log_out_after - description: | - Minutes of non-activity before logged out from UI - 0 means never get forcibly logged out - version_added: 1.10.8 - type: string - example: ~ - default: "0" - - name: session_lifetime_days + - name: session_lifetime_minutes description: | - The UI cookie lifetime in days - version_added: 1.10.8 - type: string + The UI cookie lifetime in minutes. 
User will be logged out from UI after + ``session_lifetime_minutes`` of non-activity + version_added: 1.10.13 + type: int example: ~ - default: "30" + default: "43200" - name: email description: ~ @@ -1411,10 +1439,10 @@ type: string example: ~ default: "" - - name: max_threads + - name: parsing_processes description: | - The scheduler can run multiple threads in parallel to schedule dags. - This defines how many threads will run. + The scheduler can run multiple processes in parallel to parse dags. + This defines how many processes will run. version_added: ~ type: string example: ~ @@ -1736,6 +1764,14 @@ type: string example: ~ default: "" + - name: pod_template_file + description: | + Path to the YAML pod file. If set, all other kubernetes-related fields are ignored. + (This feature is experimental) + version_added: 1.10.11 + type: string + example: ~ + default: "" - name: worker_container_tag description: ~ version_added: ~ @@ -1750,11 +1786,19 @@ default: "IfNotPresent" - name: delete_worker_pods description: | - If True (default), worker pods will be deleted upon termination + If True, all worker pods will be deleted upon termination version_added: ~ type: string example: ~ default: "True" + - name: delete_worker_pods_on_failure + description: | + If False (and delete_worker_pods is True), + failed worker pods will not be deleted so users can investigate them. + version_added: 1.10.11 + type: string + example: ~ + default: "False" - name: worker_pods_creation_batch_size description: | Number of Kubernetes Worker Pod creation calls per scheduler loop @@ -1769,6 +1813,14 @@ type: string example: ~ default: "default" + - name: multi_namespace_mode + description: | + Allows users to launch pods in multiple namespaces. + Will require creating a cluster-role for the scheduler + version_added: 1.10.12 + type: boolean + example: ~ + default: "False" - name: airflow_configmap description: | The name of the Kubernetes ConfigMap containing the Airflow Configuration (this file) @@ -1819,6 +1871,13 @@ type: string example: ~ default: "" + - name: dags_volume_mount_point + description: | + For either git sync or volume mounted DAGs, the worker will mount the volume in this path + version_added: 1.10.11 + type: string + example: ~ + default: "" - name: dags_volume_claim description: | For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path) @@ -1885,6 +1944,14 @@ type: string example: ~ default: "" + - name: git_sync_depth + description: | + Use a shallow clone with a history truncated to the specified number of commits. + 0 - do not use shallow clone. + version_added: 1.10.11 + type: string + example: ~ + default: "1" - name: git_subpath description: ~ version_added: ~ @@ -2106,13 +2173,24 @@ type: string example: ~ default: "" + - name: delete_option_kwargs + description: | + Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client + ``core_v1_api`` method when using the Kubernetes Executor. 
+ This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` + class defined here: + https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 + version_added: 1.10.12 + type: string + example: '{"grace_period_seconds": 10}' + default: "" - name: run_as_user description: | Specifies the uid to run the first process of the worker pods containers as version_added: 1.10.3 type: string example: ~ - default: "" + default: "50000" - name: fs_group description: | Specifies a gid to associate with all containers in the worker pods diff --git a/airflow/config_templates/default_airflow.cfg b/airflow/config_templates/default_airflow.cfg index e4436c42ddac8..ff001b1cbeeb9 100644 --- a/airflow/config_templates/default_airflow.cfg +++ b/airflow/config_templates/default_airflow.cfg @@ -149,6 +149,12 @@ sql_alchemy_pool_pre_ping = True # SqlAlchemy supports databases with the concept of multiple schemas. sql_alchemy_schema = +# Import path for connect args in SqlAlchemy. Default to an empty dict. +# This is useful when you want to configure db engine args that SqlAlchemy won't parse +# in connection string. +# See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args +# sql_alchemy_connect_args = + # The amount of parallelism as a setting to the executor. This defines # the max number of task instances that should run simultaneously # on this airflow installation @@ -236,11 +242,16 @@ store_serialized_dags = False # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. min_serialized_dag_update_interval = 30 +# Fetching serialized DAG can not be faster than a minimum interval to reduce database +# read rate. This config controls when your DAGs are updated in the Webserver +min_serialized_dag_fetch_interval = 10 + # Whether to persist DAG files code in DB. # If set to True, Webserver reads file contents from DB instead of # trying to access files in a DAG folder. Defaults to same as the # ``store_serialized_dags`` setting. -store_dag_code = %(store_serialized_dags)s +# Example: store_dag_code = False +# store_dag_code = # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store # in the Database. @@ -253,6 +264,10 @@ max_num_rendered_ti_fields_per_task = 30 # On each dagrun check against defined SLAs check_slas = True +# Path to custom XCom class that will be used to store and resolve operators results +# Example: xcom_backend = path.to.CustomXCom +xcom_backend = airflow.models.xcom.BaseXCom + [secrets] # Full class name of secrets backend to enable (will precede env vars and metastore in search path) # Example: backend = airflow.contrib.secrets.aws_systems_manager.SystemsManagerParameterStoreBackend @@ -281,8 +296,10 @@ endpoint_url = http://localhost:8080 fail_fast = False [api] -# How to authenticate users of the API -auth_backend = airflow.api.auth.backend.default +# How to authenticate users of the API. See +# https://airflow.apache.org/docs/stable/security.html for possible values. +# ("airflow.api.auth.backend.default" allows all requests for historic reasons) +auth_backend = airflow.api.auth.backend.deny_all [lineage] # what lineage backend to use #007A87 @@ -348,9 +365,13 @@ worker_refresh_batch_size = 1 # Number of seconds to wait before refreshing a batch of workers. worker_refresh_interval = 30 +# If set to True, Airflow will track files in plugins_folder directory. 
When it detects changes, +# then reload the gunicorn. +reload_on_plugin_change = False + # Secret key used to run your flask app # It should be as random as possible -secret_key = temporary_key +secret_key = {SECRET_KEY} # Number of workers to run the Gunicorn web server workers = 4 @@ -471,12 +492,9 @@ x_frame_enabled = True # on webserver startup update_fab_perms = True -# Minutes of non-activity before logged out from UI -# 0 means never get forcibly logged out -force_log_out_after = 0 - -# The UI cookie lifetime in days -session_lifetime_days = 30 +# The UI cookie lifetime in minutes. User will be logged out from UI after +# ``session_lifetime_minutes`` of non-activity +session_lifetime_minutes = 43200 [email] email_backend = airflow.utils.email.send_email_smtp @@ -685,9 +703,9 @@ statsd_prefix = airflow # start with the elements of the list (e.g: scheduler,executor,dagrun) statsd_allow_list = -# The scheduler can run multiple threads in parallel to schedule dags. -# This defines how many threads will run. -max_threads = 2 +# The scheduler can run multiple processes in parallel to parse dags. +# This defines how many processes will run. +parsing_processes = 2 authenticate = False # Turn off scheduler use of cron intervals by setting this to False. @@ -809,18 +827,30 @@ verify_certs = True [kubernetes] # The repository, tag and imagePullPolicy of the Kubernetes Image for the Worker to Run worker_container_repository = + +# Path to the YAML pod file. If set, all other kubernetes-related fields are ignored. +# (This feature is experimental) +pod_template_file = worker_container_tag = worker_container_image_pull_policy = IfNotPresent -# If True (default), worker pods will be deleted upon termination +# If True, all worker pods will be deleted upon termination delete_worker_pods = True +# If False (and delete_worker_pods is True), +# failed worker pods will not be deleted so users can investigate them. +delete_worker_pods_on_failure = False + # Number of Kubernetes Worker Pod creation calls per scheduler loop worker_pods_creation_batch_size = 1 # The Kubernetes namespace where airflow workers should be created. Defaults to ``default`` namespace = default +# Allows users to launch pods in multiple namespaces. +# Will require creating a cluster-role for the scheduler +multi_namespace_mode = False + # The name of the Kubernetes ConfigMap containing the Airflow Configuration (this file) # Example: airflow_configmap = airflow-configmap airflow_configmap = @@ -857,6 +887,9 @@ dags_in_image = False # For either git sync or volume mounted DAGs, the worker will look in this subpath for DAGs dags_volume_subpath = +# For either git sync or volume mounted DAGs, the worker will mount the volume in this path +dags_volume_mount_point = + # For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path) dags_volume_claim = @@ -885,6 +918,10 @@ env_from_secret_ref = # Git credentials and repository for DAGs mounted via Git (mutually exclusive with volume claim) git_repo = git_branch = + +# Use a shallow clone with a history truncated to the specified number of commits. +# 0 - do not use shallow clone. +git_sync_depth = 1 git_subpath = # The specific rev or hash the git_sync init container will checkout @@ -1008,8 +1045,16 @@ tolerations = # The timeout is specified as [connect timeout, read timeout] kube_client_request_args = +# Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client +# ``core_v1_api`` method when using the Kubernetes Executor. 
+# This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` +# class defined here: +# https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 +# Example: delete_option_kwargs = {{"grace_period_seconds": 10}} +delete_option_kwargs = + # Specifies the uid to run the first process of the worker pods containers as -run_as_user = +run_as_user = 50000 # Specifies a gid to associate with all containers in the worker pods # if using a git_ssh_key_secret_name use an fs_group diff --git a/airflow/config_templates/default_celery.py b/airflow/config_templates/default_celery.py index 35a7c510ed810..00009da53e096 100644 --- a/airflow/config_templates/default_celery.py +++ b/airflow/config_templates/default_celery.py @@ -17,30 +17,28 @@ # specific language governing permissions and limitations # under the License. """Default celery configuration.""" +import logging import ssl from airflow.configuration import conf from airflow.exceptions import AirflowConfigException, AirflowException -from airflow.utils.log.logging_mixin import LoggingMixin def _broker_supports_visibility_timeout(url): return url.startswith("redis://") or url.startswith("sqs://") -log = LoggingMixin().log +log = logging.getLogger(__name__) broker_url = conf.get('celery', 'BROKER_URL') -broker_transport_options = conf.getsection( - 'celery_broker_transport_options' -) +broker_transport_options = conf.getsection('celery_broker_transport_options') or {} if 'visibility_timeout' not in broker_transport_options: if _broker_supports_visibility_timeout(broker_url): broker_transport_options['visibility_timeout'] = 21600 DEFAULT_CELERY_CONFIG = { - 'accept_content': ['json', 'pickle'], + 'accept_content': ['json'], 'event_serializer': 'json', 'worker_prefetch_multiplier': 1, 'task_acks_late': True, diff --git a/airflow/config_templates/default_test.cfg b/airflow/config_templates/default_test.cfg index 2ea4597198250..c93baa1ad1e6c 100644 --- a/airflow/config_templates/default_test.cfg +++ b/airflow/config_templates/default_test.cfg @@ -113,7 +113,7 @@ job_heartbeat_sec = 1 scheduler_heartbeat_sec = 5 scheduler_health_check_threshold = 30 authenticate = true -max_threads = 2 +parsing_processes = 2 catchup_by_default = True scheduler_zombie_task_threshold = 300 dag_dir_list_interval = 0 diff --git a/airflow/config_templates/default_webserver_config.py b/airflow/config_templates/default_webserver_config.py index bc8a6bb1fb613..33eef9f1a7d85 100644 --- a/airflow/config_templates/default_webserver_config.py +++ b/airflow/config_templates/default_webserver_config.py @@ -32,7 +32,7 @@ SQLALCHEMY_DATABASE_URI = conf.get('core', 'SQL_ALCHEMY_CONN') # Flask-WTF flag for CSRF -CSRF_ENABLED = True +WTF_CSRF_ENABLED = True # ---------------------------------------------------- # AUTHENTICATION CONFIG @@ -65,7 +65,6 @@ # Google OAuth example: # OAUTH_PROVIDERS = [{ # 'name':'google', -# 'whitelist': ['@YOU_COMPANY_DOMAIN'], # optional # 'token_key':'access_token', # 'icon':'fa-google', # 'remote_app': { diff --git a/airflow/configuration.py b/airflow/configuration.py index 5d47cedc8629c..8c33de47cde22 100644 --- a/airflow/configuration.py +++ b/airflow/configuration.py @@ -22,11 +22,14 @@ from __future__ import print_function from __future__ import unicode_literals +from base64 import b64encode from builtins import str from collections import OrderedDict import copy import errno from future import standard_library +import multiprocessing 
+import logging import os import shlex import six @@ -40,17 +43,18 @@ from zope.deprecation import deprecated from airflow.exceptions import AirflowConfigException -from airflow.utils.log.logging_mixin import LoggingMixin +from airflow.utils.module_loading import import_string standard_library.install_aliases() -log = LoggingMixin().log +log = logging.getLogger(__name__) # show Airflow's deprecation warnings -warnings.filterwarnings( - action='default', category=DeprecationWarning, module='airflow') -warnings.filterwarnings( - action='default', category=PendingDeprecationWarning, module='airflow') +if not sys.warnoptions: + warnings.filterwarnings( + action='default', category=DeprecationWarning, module='airflow') + warnings.filterwarnings( + action='default', category=PendingDeprecationWarning, module='airflow') def generate_fernet_key(): @@ -99,6 +103,15 @@ def run_command(command): return output +def _get_config_value_from_secret_backend(config_key): + """Get Config option values from Secret Backend""" + from airflow import secrets + secrets_client = secrets.get_custom_secret_backend() + if not secrets_client: + return None + return secrets_client.get_config(config_key) + + def _read_default_config_file(file_name): templates_dir = os.path.join(os.path.dirname(__file__), 'config_templates') file_path = os.path.join(templates_dir, file_name) @@ -133,7 +146,9 @@ class AirflowConfigParser(ConfigParser): # These configuration elements can be fetched as the stdout of commands # following the "{section}__{name}__cmd" pattern, the idea behind this # is to not store password on boxes in text files. - as_command_stdout = { + # These configs can also be fetched from Secrets backend + # following the "{section}__{name}__secret" pattern + sensitive_config_values = { ('core', 'sql_alchemy_conn'), ('core', 'fernet_key'), ('celery', 'broker_url'), @@ -169,6 +184,9 @@ class AirflowConfigParser(ConfigParser): 'json_format': 'elasticsearch_json_format', 'json_fields': 'elasticsearch_json_fields' + }, + 'scheduler': { + 'parsing_processes': 'max_threads' } } @@ -196,6 +214,35 @@ def __init__(self, default_config=None, *args, **kwargs): self.is_validated = False def _validate(self): + self._validate_config_dependencies() + for section, replacement in self.deprecated_values.items(): + for name, info in replacement.items(): + old, new, version = info + if self.get(section, name, fallback=None) == old: + # Make sure the env var option is removed, otherwise it + # would be read and used instead of the value we set + env_var = self._env_var_name(section, name) + os.environ.pop(env_var, None) + + self.set(section, name, new) + warnings.warn( + 'The {name} setting in [{section}] has the old default value ' + 'of {old!r}. This value has been changed to {new!r} in the ' + 'running config, but please update your config before Apache ' + 'Airflow {version}.'.format( + name=name, section=section, old=old, new=new, version=version + ), + FutureWarning + ) + + self.is_validated = True + + def _validate_config_dependencies(self): + """ + Validate that config values aren't invalid given other config values + or system-level limitations and requirements. 
+ """ + if ( self.get("core", "executor") not in ('DebugExecutor', 'SequentialExecutor') and "sqlite" in self.get('core', 'sql_alchemy_conn')): @@ -221,27 +268,14 @@ def _validate(self): "error: attempt at using ldapgroup " "filtering without using the Ldap backend") - for section, replacement in self.deprecated_values.items(): - for name, info in replacement.items(): - old, new, version = info - if self.get(section, name, fallback=None) == old: - # Make sure the env var option is removed, otherwise it - # would be read and used instead of the value we set - env_var = self._env_var_name(section, name) - os.environ.pop(env_var, None) + if self.has_option('core', 'mp_start_method'): + mp_start_method = self.get('core', 'mp_start_method') + start_method_options = multiprocessing.get_all_start_methods() - self.set(section, name, new) - warnings.warn( - 'The {name} setting in [{section}] has the old default value ' - 'of {old!r}. This value has been changed to {new!r} in the ' - 'running config, but please update your config before Apache ' - 'Airflow {version}.'.format( - name=name, section=section, old=old, new=new, version=version - ), - FutureWarning - ) - - self.is_validated = True + if mp_start_method not in start_method_options: + raise AirflowConfigException( + "mp_start_method should not be " + mp_start_method + + ". Possible values are " + ", ".join(start_method_options)) @staticmethod def _env_var_name(section, key): @@ -256,19 +290,32 @@ def _get_env_var_option(self, section, key): env_var_cmd = env_var + '_CMD' if env_var_cmd in os.environ: # if this is a valid command key... - if (section, key) in self.as_command_stdout: + if (section, key) in self.sensitive_config_values: return run_command(os.environ[env_var_cmd]) + # alternatively AIRFLOW__{SECTION}__{KEY}_SECRET (to get from Secrets Backend) + env_var_secret_path = env_var + '_SECRET' + if env_var_secret_path in os.environ: + # if this is a valid secret path... + if (section, key) in self.sensitive_config_values: + return _get_config_value_from_secret_backend(os.environ[env_var_secret_path]) def _get_cmd_option(self, section, key): fallback_key = key + '_cmd' # if this is a valid command key... - if (section, key) in self.as_command_stdout: - if super(AirflowConfigParser, self) \ - .has_option(section, fallback_key): - command = super(AirflowConfigParser, self) \ - .get(section, fallback_key) + if (section, key) in self.sensitive_config_values: + if super(AirflowConfigParser, self).has_option(section, fallback_key): + command = super(AirflowConfigParser, self).get(section, fallback_key) return run_command(command) + def _get_secret_option(self, section, key): + """Get Config option values from Secret Backend""" + fallback_key = key + '_secret' + # if this is a valid secret key... 
+ if (section, key) in self.sensitive_config_values: + if super(AirflowConfigParser, self).has_option(section, fallback_key): + secrets_path = super(AirflowConfigParser, self).get(section, fallback_key) + return _get_config_value_from_secret_backend(secrets_path) + def get(self, section, key, **kwargs): section = str(section).lower() key = str(key).lower() @@ -310,6 +357,16 @@ def get(self, section, key, **kwargs): self._warn_deprecate(section, key, deprecated_name) return option + # ...then from secret backends + option = self._get_secret_option(section, key) + if option: + return option + if deprecated_name: + option = self._get_secret_option(section, deprecated_name) + if option: + self._warn_deprecate(section, key, deprecated_name) + return option + # ...then the default config if self.airflow_defaults.has_option(section, key) or 'fallback' in kwargs: return expand_env_var( @@ -324,6 +381,26 @@ def get(self, section, key, **kwargs): "section/key [{section}/{key}] not found " "in config".format(section=section, key=key)) + def getimport(self, section, key, **kwargs): + """ + Reads options, imports the fully qualified name, and returns the object. + In case of failure, it throws an exception with a clear message containing the key and the section names + :return: The object or None, if the option is empty + """ + full_qualified_path = conf.get(section=section, key=key, **kwargs) + if not full_qualified_path: + return None + + try: + return import_string(full_qualified_path) + except ImportError as e: + log.error(e) + raise AirflowConfigException( + 'The object could not be loaded. Please check "{key}" key in "{section}" section. ' + 'Current value: "{full_qualified_path}".'.format( + key=key, section=section, full_qualified_path=full_qualified_path) + ) + def getboolean(self, section, key, **kwargs): val = str(self.get(section, key, **kwargs)).lower().strip() if '#' in val: @@ -393,7 +470,10 @@ def getsection(self, section): section_prefix = 'AIRFLOW__{S}__'.format(S=section.upper()) for env_var in sorted(os.environ.keys()): if env_var.startswith(section_prefix): - key = env_var.replace(section_prefix, '').lower() + key = env_var.replace(section_prefix, '') + if key.endswith("_CMD"): + key = key[:-4] + key = key.lower() _section[key] = self._get_env_var_option(section, key) for key, val in iteritems(_section): @@ -410,9 +490,22 @@ def getsection(self, section): _section[key] = val return _section + def write(self, fp, space_around_delimiters=True): + # This is based on the configparser.RawConfigParser.write method code to add support for + # reading options from environment variables. + if space_around_delimiters: + d = " {} ".format(self._delimiters[0]) # type: ignore + else: + d = self._delimiters[0] # type: ignore + if self._defaults: + self._write_section(fp, self.default_section, self._defaults.items(), d) # type: ignore + for section in self._sections: + self._write_section(fp, section, self.getsection(section).items(), d) # type: ignore + def as_dict( self, display_source=False, display_sensitive=False, raw=False, - include_env=True, include_cmds=True): + include_env=True, include_cmds=True, include_secret=True + ): """ Returns the current configuration as an OrderedDict of OrderedDicts.
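For orientation, a minimal sketch of how a sensitive option can be supplied through the new ``_SECRET`` lookup added above, assuming a secrets backend is configured under ``[secrets]``; the secret path used here is made up and its real format depends on the chosen backend:

import os

from airflow.configuration import conf

# Hypothetical secret path: the value will be fetched through
# secrets.get_custom_secret_backend().get_config(...) as in the patch above.
os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN_SECRET"] = "airflow/config/sql_alchemy_conn"

# Only options listed in AirflowConfigParser.sensitive_config_values honour the
# _CMD / _SECRET suffixes (or the matching *_cmd / *_secret keys in airflow.cfg);
# when no secrets backend is configured the lookup falls through to the other sources.
print(conf.get("core", "sql_alchemy_conn"))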
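In the same spirit, a rough usage sketch for the new ``getimport`` method, which is presumably how options such as the new ``xcom_backend`` (a class referenced by its dotted path, shipped default ``airflow.models.xcom.BaseXCom``) get resolved:

from airflow.configuration import conf

# Reads the dotted path stored under [core] xcom_backend and imports it via
# airflow.utils.module_loading.import_string; returns None if the option is empty
# and raises AirflowConfigException naming the section/key if the import fails.
xcom_class = conf.getimport("core", "xcom_backend")
print(xcom_class)  # with the shipped default: airflow.models.xcom.BaseXCom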
@@ -434,6 +527,12 @@ def as_dict( set (True, default), or should the _cmd options be left as the command to run (False) :type include_cmds: bool + :param include_secret: Should the result of calling any *_secret config be + set (True, default), or should the _secret options be left as the + path to get the secret from (False) + :type include_secret: bool + :return: Dictionary, where the key is the name of the section and the content is + the dictionary with the name of the parameter and its value. """ cfg = {} configs = [ @@ -475,7 +574,7 @@ def as_dict( # add bash commands if include_cmds: - for (section, key) in self.as_command_stdout: + for (section, key) in self.sensitive_config_values: opt = self._get_cmd_option(section, key) if opt: if not display_sensitive: @@ -487,6 +586,20 @@ def as_dict( cfg.setdefault(section, OrderedDict()).update({key: opt}) del cfg[section][key + '_cmd'] + # add config from secret backends + if include_secret: + for (section, key) in self.sensitive_config_values: + opt = self._get_secret_option(section, key) + if opt: + if not display_sensitive: + opt = '< hidden >' + if display_source: + opt = (opt, 'secret') + elif raw: + opt = opt.replace('%', '%%') + cfg.setdefault(section, OrderedDict()).update({key: opt}) + del cfg[section][key + '_secret'] + return cfg def load_test_config(self): @@ -594,6 +707,8 @@ def get_airflow_test_config(airflow_home): else: FERNET_KEY = '' +SECRET_KEY = b64encode(os.urandom(16)).decode('utf-8') + TEMPLATE_START = ( '# ----------------------- TEMPLATE BEGINS HERE -----------------------') if not os.path.isfile(TEST_CONFIG_FILE): diff --git a/airflow/contrib/auth/backends/github_enterprise_auth.py b/airflow/contrib/auth/backends/github_enterprise_auth.py index 28819222e5943..0cfd61751dc2c 100644 --- a/airflow/contrib/auth/backends/github_enterprise_auth.py +++ b/airflow/contrib/auth/backends/github_enterprise_auth.py @@ -16,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import logging import flask_login # Need to expose these downstream @@ -29,9 +30,8 @@ from airflow import models from airflow.configuration import AirflowConfigException, conf from airflow.utils.db import provide_session -from airflow.utils.log.logging_mixin import LoggingMixin -log = LoggingMixin().log +log = logging.getLogger(__name__) def get_config_param(param): diff --git a/airflow/contrib/auth/backends/google_auth.py b/airflow/contrib/auth/backends/google_auth.py index 2ce23ae56cd56..5554e46ed434c 100644 --- a/airflow/contrib/auth/backends/google_auth.py +++ b/airflow/contrib/auth/backends/google_auth.py @@ -16,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import logging import flask_login # Need to expose these downstream @@ -29,9 +30,8 @@ from airflow import models from airflow.configuration import conf from airflow.utils.db import provide_session -from airflow.utils.log.logging_mixin import LoggingMixin -log = LoggingMixin().log +log = logging.getLogger(__name__) def get_config_param(param): @@ -95,6 +95,10 @@ def init_app(self, flask_app): consumer_key=get_config_param('client_id'), consumer_secret=get_config_param('client_secret'), request_token_params={'scope': [ + 'https://www.googleapis.com/auth/userinfo.profile', + 'https://www.googleapis.com/auth/userinfo.email'], + 'prompt': get_config_param('prompt') + } if get_config_param('prompt') else {'scope': [ 'https://www.googleapis.com/auth/userinfo.profile', 'https://www.googleapis.com/auth/userinfo.email']}, base_url='https://www.google.com/accounts/', diff --git a/airflow/contrib/auth/backends/kerberos_auth.py b/airflow/contrib/auth/backends/kerberos_auth.py index e84a0b2aeb986..63cea8da3480c 100644 --- a/airflow/contrib/auth/backends/kerberos_auth.py +++ b/airflow/contrib/auth/backends/kerberos_auth.py @@ -20,7 +20,9 @@ import logging import flask_login from airflow.exceptions import AirflowConfigException -from flask_login import current_user +# Need to expose these downstream +# flake8: noqa: F401 +from flask_login import current_user, login_required, logout_user from flask import flash from wtforms import Form, PasswordField, StringField from wtforms.validators import InputRequired diff --git a/airflow/contrib/auth/backends/ldap_auth.py b/airflow/contrib/auth/backends/ldap_auth.py index 7368deae88dd8..c03410503e33a 100644 --- a/airflow/contrib/auth/backends/ldap_auth.py +++ b/airflow/contrib/auth/backends/ldap_auth.py @@ -17,33 +17,29 @@ # specific language governing permissions and limitations # under the License. 
from future.utils import native +import logging +import re +import ssl +import traceback import flask_login from flask_login import login_required, current_user, logout_user # noqa: F401 -from flask import flash +from flask import flash, redirect, url_for from wtforms import Form, PasswordField, StringField from wtforms.validators import InputRequired from ldap3 import Server, Connection, Tls, set_config_parameter, LEVEL, SUBTREE -import ssl - -from flask import url_for, redirect from airflow import models -from airflow.configuration import conf -from airflow.configuration import AirflowConfigException +from airflow.configuration import AirflowConfigException, conf from airflow.utils.db import provide_session -import traceback -import re - -from airflow.utils.log.logging_mixin import LoggingMixin LOGIN_MANAGER = flask_login.LoginManager() LOGIN_MANAGER.login_view = 'airflow.login' # Calls login() below LOGIN_MANAGER.login_message = None -log = LoggingMixin().log +log = logging.getLogger(__name__) class AuthenticationError(Exception): diff --git a/airflow/contrib/auth/backends/password_auth.py b/airflow/contrib/auth/backends/password_auth.py index e04a40d455b89..7b2b9efa46c24 100644 --- a/airflow/contrib/auth/backends/password_auth.py +++ b/airflow/contrib/auth/backends/password_auth.py @@ -20,6 +20,7 @@ from __future__ import unicode_literals import base64 +import logging from functools import wraps from sys import version_info @@ -41,14 +42,13 @@ from airflow import models from airflow.utils.db import provide_session, create_session -from airflow.utils.log.logging_mixin import LoggingMixin LOGIN_MANAGER = flask_login.LoginManager() LOGIN_MANAGER.login_view = 'airflow.login' # Calls login() below LOGIN_MANAGER.login_message = None -LOG = LoggingMixin().log PY3 = version_info[0] == 3 +log = logging.getLogger(__name__) CLIENT_AUTH = None @@ -119,7 +119,7 @@ def is_superuser(self): @provide_session def load_user(userid, session=None): """Loads user from the database""" - LOG.debug("Loading user %s", userid) + log.debug("Loading user %s", userid) if not userid or userid == 'None': return None @@ -151,7 +151,7 @@ def authenticate(session, username, password): if not user.authenticate(password): raise AuthenticationError() - LOG.info("User %s successfully authenticated", username) + log.info("User %s successfully authenticated", username) return user diff --git a/airflow/contrib/example_dags/example_dingding_operator.py b/airflow/contrib/example_dags/example_dingding_operator.py index dea5cef87feec..0072175de930b 100644 --- a/airflow/contrib/example_dags/example_dingding_operator.py +++ b/airflow/contrib/example_dags/example_dingding_operator.py @@ -172,7 +172,7 @@ def failure_callback(context): 'actionURL': 'http://airflow.apache.org' }, { - 'title': 'Airflow Github', + 'title': 'Airflow GitHub', 'actionURL': 'https://github.com/apache/airflow' } ] diff --git a/airflow/contrib/example_dags/example_kubernetes_operator.py b/airflow/contrib/example_dags/example_kubernetes_operator.py index e9453356bcff0..645b4b6e3ae0c 100644 --- a/airflow/contrib/example_dags/example_kubernetes_operator.py +++ b/airflow/contrib/example_dags/example_kubernetes_operator.py @@ -19,11 +19,13 @@ """ This is an example dag for using the KubernetesPodOperator. 
""" -from airflow.utils.dates import days_ago -from airflow.utils.log.logging_mixin import LoggingMixin + +import logging + from airflow.models import DAG +from airflow.utils.dates import days_ago -log = LoggingMixin().log +log = logging.getLogger(__name__) try: # Kubernetes is optional, so not available in vanilla Airflow @@ -64,6 +66,6 @@ ) except ImportError as e: - log.warning("Could not import KubernetesPodOperator: " + str(e)) - log.warning("Install kubernetes dependencies with: " + log.warning("Could not import KubernetesPodOperator: %s, ", str(e)) + log.warning("Install kubernetes dependencies with: \n" " pip install 'apache-airflow[kubernetes]'") diff --git a/airflow/contrib/executors/__init__.py b/airflow/contrib/executors/__init__.py index b7f8352944d3f..114d189da14ab 100644 --- a/airflow/contrib/executors/__init__.py +++ b/airflow/contrib/executors/__init__.py @@ -16,4 +16,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# diff --git a/airflow/contrib/executors/kubernetes_executor.py b/airflow/contrib/executors/kubernetes_executor.py index f180664c0bf45..17b5ef6ceb8e9 100644 --- a/airflow/contrib/executors/kubernetes_executor.py +++ b/airflow/contrib/executors/kubernetes_executor.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -14,933 +16,5 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Kubernetes executor""" -import base64 -import hashlib -from queue import Empty - -import re -import json -import multiprocessing -from uuid import uuid4 -import time - -from dateutil import parser - -import kubernetes -from kubernetes import watch, client -from kubernetes.client.rest import ApiException -from urllib3.exceptions import HTTPError, ReadTimeoutError - -from airflow.configuration import conf -from airflow.contrib.kubernetes.pod_launcher import PodLauncher -from airflow.contrib.kubernetes.kube_client import get_kube_client -from airflow.contrib.kubernetes.worker_configuration import WorkerConfiguration -from airflow.executors.base_executor import BaseExecutor -from airflow.executors import Executors -from airflow.models import KubeResourceVersion, KubeWorkerIdentifier, TaskInstance -from airflow.utils.state import State -from airflow.utils.db import provide_session, create_session -from airflow import settings -from airflow.exceptions import AirflowConfigException, AirflowException -from airflow.utils.log.logging_mixin import LoggingMixin - -MAX_POD_ID_LEN = 253 -MAX_LABEL_LEN = 63 - - -class KubernetesExecutorConfig: - def __init__(self, image=None, image_pull_policy=None, request_memory=None, - request_cpu=None, limit_memory=None, limit_cpu=None, limit_gpu=None, - gcp_service_account_key=None, node_selectors=None, affinity=None, - annotations=None, volumes=None, volume_mounts=None, tolerations=None, labels=None): - self.image = image - self.image_pull_policy = image_pull_policy - self.request_memory = request_memory - self.request_cpu = request_cpu - self.limit_memory = limit_memory - self.limit_cpu = limit_cpu - self.limit_gpu = limit_gpu - self.gcp_service_account_key = gcp_service_account_key - self.node_selectors = node_selectors - self.affinity = affinity - self.annotations = annotations - self.volumes = 
volumes - self.volume_mounts = volume_mounts - self.tolerations = tolerations - self.labels = labels or {} - - def __repr__(self): - return "{}(image={}, image_pull_policy={}, request_memory={}, request_cpu={}, " \ - "limit_memory={}, limit_cpu={}, limit_gpu={}, gcp_service_account_key={}, " \ - "node_selectors={}, affinity={}, annotations={}, volumes={}, " \ - "volume_mounts={}, tolerations={}, labels={})" \ - .format(KubernetesExecutorConfig.__name__, self.image, self.image_pull_policy, - self.request_memory, self.request_cpu, self.limit_memory, - self.limit_cpu, self.limit_gpu, self.gcp_service_account_key, self.node_selectors, - self.affinity, self.annotations, self.volumes, self.volume_mounts, - self.tolerations, self.labels) - - @staticmethod - def from_dict(obj): - if obj is None: - return KubernetesExecutorConfig() - - if not isinstance(obj, dict): - raise TypeError( - 'Cannot convert a non-dictionary object into a KubernetesExecutorConfig') - - namespaced = obj.get(Executors.KubernetesExecutor, {}) - - return KubernetesExecutorConfig( - image=namespaced.get('image', None), - image_pull_policy=namespaced.get('image_pull_policy', None), - request_memory=namespaced.get('request_memory', None), - request_cpu=namespaced.get('request_cpu', None), - limit_memory=namespaced.get('limit_memory', None), - limit_cpu=namespaced.get('limit_cpu', None), - limit_gpu=namespaced.get('limit_gpu', None), - gcp_service_account_key=namespaced.get('gcp_service_account_key', None), - node_selectors=namespaced.get('node_selectors', None), - affinity=namespaced.get('affinity', None), - annotations=namespaced.get('annotations', {}), - volumes=namespaced.get('volumes', []), - volume_mounts=namespaced.get('volume_mounts', []), - tolerations=namespaced.get('tolerations', None), - labels=namespaced.get('labels', {}), - ) - - def as_dict(self): - return { - 'image': self.image, - 'image_pull_policy': self.image_pull_policy, - 'request_memory': self.request_memory, - 'request_cpu': self.request_cpu, - 'limit_memory': self.limit_memory, - 'limit_cpu': self.limit_cpu, - 'limit_gpu': self.limit_gpu, - 'gcp_service_account_key': self.gcp_service_account_key, - 'node_selectors': self.node_selectors, - 'affinity': self.affinity, - 'annotations': self.annotations, - 'volumes': self.volumes, - 'volume_mounts': self.volume_mounts, - 'tolerations': self.tolerations, - 'labels': self.labels, - } - - -class KubeConfig: - """Configuration for Kubernetes""" - core_section = 'core' - kubernetes_section = 'kubernetes' - - def __init__(self): - configuration_dict = conf.as_dict(display_sensitive=True) - self.core_configuration = configuration_dict['core'] - self.kube_secrets = configuration_dict.get('kubernetes_secrets', {}) - self.kube_env_vars = configuration_dict.get('kubernetes_environment_variables', {}) - self.env_from_configmap_ref = conf.get(self.kubernetes_section, - 'env_from_configmap_ref') - self.env_from_secret_ref = conf.get(self.kubernetes_section, - 'env_from_secret_ref') - self.airflow_home = settings.AIRFLOW_HOME - self.dags_folder = conf.get(self.core_section, 'dags_folder') - self.parallelism = conf.getint(self.core_section, 'parallelism') - self.worker_container_repository = conf.get( - self.kubernetes_section, 'worker_container_repository') - self.worker_container_tag = conf.get( - self.kubernetes_section, 'worker_container_tag') - self.kube_image = '{}:{}'.format( - self.worker_container_repository, self.worker_container_tag) - self.kube_image_pull_policy = conf.get( - self.kubernetes_section, 
"worker_container_image_pull_policy" - ) - self.kube_node_selectors = configuration_dict.get('kubernetes_node_selectors', {}) - self.kube_annotations = configuration_dict.get('kubernetes_annotations', {}) - self.kube_labels = configuration_dict.get('kubernetes_labels', {}) - self.delete_worker_pods = conf.getboolean( - self.kubernetes_section, 'delete_worker_pods') - self.worker_pods_creation_batch_size = conf.getint( - self.kubernetes_section, 'worker_pods_creation_batch_size') - self.worker_service_account_name = conf.get( - self.kubernetes_section, 'worker_service_account_name') - self.image_pull_secrets = conf.get(self.kubernetes_section, 'image_pull_secrets') - - # NOTE: user can build the dags into the docker image directly, - # this will set to True if so - self.dags_in_image = conf.getboolean(self.kubernetes_section, 'dags_in_image') - - # Run as user for pod security context - self.worker_run_as_user = self._get_security_context_val('run_as_user') - self.worker_fs_group = self._get_security_context_val('fs_group') - - # NOTE: `git_repo` and `git_branch` must be specified together as a pair - # The http URL of the git repository to clone from - self.git_repo = conf.get(self.kubernetes_section, 'git_repo') - # The branch of the repository to be checked out - self.git_branch = conf.get(self.kubernetes_section, 'git_branch') - # Optionally, the directory in the git repository containing the dags - self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath') - # Optionally, the root directory for git operations - self.git_sync_root = conf.get(self.kubernetes_section, 'git_sync_root') - # Optionally, the name at which to publish the checked-out files under --root - self.git_sync_dest = conf.get(self.kubernetes_section, 'git_sync_dest') - # Optionally, the tag or hash to checkout - self.git_sync_rev = conf.get(self.kubernetes_section, 'git_sync_rev') - # Optionally, if git_dags_folder_mount_point is set the worker will use - # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath} as dags_folder - self.git_dags_folder_mount_point = conf.get(self.kubernetes_section, - 'git_dags_folder_mount_point') - - # Optionally a user may supply a (`git_user` AND `git_password`) OR - # (`git_ssh_key_secret_name` AND `git_ssh_key_secret_key`) for private repositories - self.git_user = conf.get(self.kubernetes_section, 'git_user') - self.git_password = conf.get(self.kubernetes_section, 'git_password') - self.git_ssh_key_secret_name = conf.get(self.kubernetes_section, 'git_ssh_key_secret_name') - self.git_ssh_known_hosts_configmap_name = conf.get(self.kubernetes_section, - 'git_ssh_known_hosts_configmap_name') - self.git_sync_credentials_secret = conf.get(self.kubernetes_section, - 'git_sync_credentials_secret') - - # NOTE: The user may optionally use a volume claim to mount a PV containing - # DAGs directly - self.dags_volume_claim = conf.get(self.kubernetes_section, 'dags_volume_claim') - - # This prop may optionally be set for PV Claims and is used to write logs - self.logs_volume_claim = conf.get(self.kubernetes_section, 'logs_volume_claim') - - # This prop may optionally be set for PV Claims and is used to locate DAGs - # on a SubPath - self.dags_volume_subpath = conf.get( - self.kubernetes_section, 'dags_volume_subpath') - - # This prop may optionally be set for PV Claims and is used to locate logs - # on a SubPath - self.logs_volume_subpath = conf.get( - self.kubernetes_section, 'logs_volume_subpath') - - # Optionally, hostPath volume containing DAGs - self.dags_volume_host = 
conf.get(self.kubernetes_section, 'dags_volume_host') - - # Optionally, write logs to a hostPath Volume - self.logs_volume_host = conf.get(self.kubernetes_section, 'logs_volume_host') - - # This prop may optionally be set for PV Claims and is used to write logs - self.base_log_folder = conf.get(self.core_section, 'base_log_folder') - - # The Kubernetes Namespace in which the Scheduler and Webserver reside. Note - # that if your - # cluster has RBAC enabled, your scheduler may need service account permissions to - # create, watch, get, and delete pods in this namespace. - self.kube_namespace = conf.get(self.kubernetes_section, 'namespace') - # The Kubernetes Namespace in which pods will be created by the executor. Note - # that if your - # cluster has RBAC enabled, your workers may need service account permissions to - # interact with cluster components. - self.executor_namespace = conf.get(self.kubernetes_section, 'namespace') - # Task secrets managed by KubernetesExecutor. - self.gcp_service_account_keys = conf.get(self.kubernetes_section, - 'gcp_service_account_keys') - - # If the user is using the git-sync container to clone their repository via git, - # allow them to specify repository, tag, and pod name for the init container. - self.git_sync_container_repository = conf.get( - self.kubernetes_section, 'git_sync_container_repository') - - self.git_sync_container_tag = conf.get( - self.kubernetes_section, 'git_sync_container_tag') - self.git_sync_container = '{}:{}'.format( - self.git_sync_container_repository, self.git_sync_container_tag) - - self.git_sync_init_container_name = conf.get( - self.kubernetes_section, 'git_sync_init_container_name') - - self.git_sync_run_as_user = self._get_security_context_val('git_sync_run_as_user') - - # The worker pod may optionally have a valid Airflow config loaded via a - # configmap - self.airflow_configmap = conf.get(self.kubernetes_section, 'airflow_configmap') - - # The worker pod may optionally have a valid Airflow local settings loaded via a - # configmap - self.airflow_local_settings_configmap = conf.get( - self.kubernetes_section, 'airflow_local_settings_configmap') - - affinity_json = conf.get(self.kubernetes_section, 'affinity') - if affinity_json: - self.kube_affinity = json.loads(affinity_json) - else: - self.kube_affinity = None - - tolerations_json = conf.get(self.kubernetes_section, 'tolerations') - if tolerations_json: - self.kube_tolerations = json.loads(tolerations_json) - else: - self.kube_tolerations = None - - kube_client_request_args = conf.get(self.kubernetes_section, 'kube_client_request_args') - if kube_client_request_args: - self.kube_client_request_args = json.loads(kube_client_request_args) - if self.kube_client_request_args['_request_timeout'] and \ - isinstance(self.kube_client_request_args['_request_timeout'], list): - self.kube_client_request_args['_request_timeout'] = \ - tuple(self.kube_client_request_args['_request_timeout']) - else: - self.kube_client_request_args = {} - self._validate() - - # pod security context items should return integers - # and only return a blank string if contexts are not set. 
- def _get_security_context_val(self, scontext): - val = conf.get(self.kubernetes_section, scontext) - if not val: - return 0 - else: - return int(val) - - def _validate(self): - # TODO: use XOR for dags_volume_claim and git_dags_folder_mount_point - if not self.dags_volume_claim \ - and not self.dags_volume_host \ - and not self.dags_in_image \ - and (not self.git_repo or not self.git_branch or not self.git_dags_folder_mount_point): - raise AirflowConfigException( - 'In kubernetes mode the following must be set in the `kubernetes` ' - 'config section: `dags_volume_claim` ' - 'or `dags_volume_host` ' - 'or `dags_in_image` ' - 'or `git_repo and git_branch and git_dags_folder_mount_point`') - if self.git_repo \ - and (self.git_user or self.git_password) \ - and self.git_ssh_key_secret_name: - raise AirflowConfigException( - 'In kubernetes mode, using `git_repo` to pull the DAGs: ' - 'for private repositories, either `git_user` and `git_password` ' - 'must be set for authentication through user credentials; ' - 'or `git_ssh_key_secret_name` must be set for authentication ' - 'through ssh key, but not both') - - -class KubernetesJobWatcher(multiprocessing.Process, LoggingMixin): - """Watches for Kubernetes jobs""" - def __init__(self, namespace, watcher_queue, resource_version, worker_uuid, kube_config): - multiprocessing.Process.__init__(self) - self.namespace = namespace - self.worker_uuid = worker_uuid - self.watcher_queue = watcher_queue - self.resource_version = resource_version - self.kube_config = kube_config - - def run(self): - """Performs watching""" - kube_client = get_kube_client() - while True: - try: - self.resource_version = self._run(kube_client, self.resource_version, - self.worker_uuid, self.kube_config) - except ReadTimeoutError: - self.log.warning("There was a timeout error accessing the Kube API. " - "Retrying request.", exc_info=True) - time.sleep(1) - except Exception: - self.log.exception('Unknown error in KubernetesJobWatcher. 
Failing') - raise - else: - self.log.warning('Watch died gracefully, starting back up with: ' - 'last resource_version: %s', self.resource_version) - - def _run(self, kube_client, resource_version, worker_uuid, kube_config): - self.log.info( - 'Event: and now my watch begins starting at resource_version: %s', - resource_version - ) - watcher = watch.Watch() - - kwargs = {'label_selector': 'airflow-worker={}'.format(worker_uuid)} - if resource_version: - kwargs['resource_version'] = resource_version - if kube_config.kube_client_request_args: - for key, value in kube_config.kube_client_request_args.items(): - kwargs[key] = value - - last_resource_version = None - for event in watcher.stream(kube_client.list_namespaced_pod, self.namespace, - **kwargs): - task = event['object'] - self.log.info( - 'Event: %s had an event of type %s', - task.metadata.name, event['type'] - ) - if event['type'] == 'ERROR': - return self.process_error(event) - self.process_status( - pod_id=task.metadata.name, - namespace=task.metadata.namespace, - status=task.status.phase, - labels=task.metadata.labels, - resource_version=task.metadata.resource_version, - event=event, - ) - last_resource_version = task.metadata.resource_version - - return last_resource_version - - def process_error(self, event): - """Process error response""" - self.log.error( - 'Encountered Error response from k8s list namespaced pod stream => %s', - event - ) - raw_object = event['raw_object'] - if raw_object['code'] == 410: - self.log.info( - 'Kubernetes resource version is too old, must reset to 0 => %s', - (raw_object['message'],) - ) - # Return resource version 0 - return '0' - raise AirflowException( - 'Kubernetes failure for %s with code %s and message: %s' % - (raw_object['reason'], raw_object['code'], raw_object['message']) - ) - - def process_status(self, pod_id, namespace, status, labels, resource_version, event): - """Process status response""" - if status == 'Pending': - if event['type'] == 'DELETED': - self.log.info('Event: Failed to start pod %s, will reschedule', pod_id) - self.watcher_queue.put((pod_id, namespace, State.UP_FOR_RESCHEDULE, labels, resource_version)) - else: - self.log.info('Event: %s Pending', pod_id) - elif status == 'Failed': - self.log.info('Event: %s Failed', pod_id) - self.watcher_queue.put((pod_id, namespace, State.FAILED, labels, resource_version)) - elif status == 'Succeeded': - self.log.info('Event: %s Succeeded', pod_id) - self.watcher_queue.put((pod_id, namespace, None, labels, resource_version)) - elif status == 'Running': - self.log.info('Event: %s is Running', pod_id) - else: - self.log.warning( - 'Event: Invalid state: %s on pod: %s in namespace %s with labels: %s with ' - 'resource_version: %s', status, pod_id, namespace, labels, resource_version - ) - - -class AirflowKubernetesScheduler(LoggingMixin): - """Airflow Scheduler for Kubernetes""" - def __init__(self, kube_config, task_queue, result_queue, kube_client, worker_uuid): - self.log.debug("Creating Kubernetes executor") - self.kube_config = kube_config - self.task_queue = task_queue - self.result_queue = result_queue - self.namespace = self.kube_config.kube_namespace - self.log.debug("Kubernetes using namespace %s", self.namespace) - self.kube_client = kube_client - self.launcher = PodLauncher(kube_client=self.kube_client) - self.worker_configuration = WorkerConfiguration(kube_config=self.kube_config) - self._manager = multiprocessing.Manager() - self.watcher_queue = self._manager.Queue() - self.worker_uuid = worker_uuid - self.kube_watcher = 
self._make_kube_watcher() - - def _make_kube_watcher(self): - resource_version = KubeResourceVersion.get_current_resource_version() - watcher = KubernetesJobWatcher(self.namespace, self.watcher_queue, - resource_version, self.worker_uuid, self.kube_config) - watcher.start() - return watcher - - def _health_check_kube_watcher(self): - if self.kube_watcher.is_alive(): - pass - else: - self.log.error( - 'Error while health checking kube watcher process. ' - 'Process died for unknown reasons') - self.kube_watcher = self._make_kube_watcher() - - def run_next(self, next_job): - """ - The run_next command will check the task_queue for any un-run jobs. - It will then create a unique job-id, launch that job in the cluster, - and store relevant info in the current_jobs map so we can track the job's - status - """ - self.log.info('Kubernetes job is %s', str(next_job)) - key, command, kube_executor_config = next_job - dag_id, task_id, execution_date, try_number = key - self.log.debug("Kubernetes running for command %s", command) - self.log.debug("Kubernetes launching image %s", self.kube_config.kube_image) - pod = self.worker_configuration.make_pod( - namespace=self.namespace, worker_uuid=self.worker_uuid, - pod_id=self._create_pod_id(dag_id, task_id), - dag_id=self._make_safe_label_value(dag_id), - task_id=self._make_safe_label_value(task_id), - try_number=try_number, - execution_date=self._datetime_to_label_safe_datestring(execution_date), - airflow_command=command, kube_executor_config=kube_executor_config - ) - # the watcher will monitor pods, so we do not block. - self.launcher.run_pod_async(pod, **self.kube_config.kube_client_request_args) - self.log.debug("Kubernetes Job created!") - - def delete_pod(self, pod_id, namespace): - """Deletes POD""" - try: - self.kube_client.delete_namespaced_pod( - pod_id, namespace, body=client.V1DeleteOptions(), - **self.kube_config.kube_client_request_args) - except ApiException as e: - # If the pod is already deleted - if e.status != 404: - raise - - def sync(self): - """ - The sync function checks the status of all currently running kubernetes jobs. - If a job is completed, it's status is placed in the result queue to - be sent back to the scheduler. - - :return: - - """ - self._health_check_kube_watcher() - while True: - try: - task = self.watcher_queue.get_nowait() - try: - self.process_watcher_task(task) - finally: - self.watcher_queue.task_done() - except Empty: - break - - def process_watcher_task(self, task): - """Process the task by watcher.""" - pod_id, namespace, state, labels, resource_version = task - self.log.info( - 'Attempting to finish pod; pod_id: %s; state: %s; labels: %s', - pod_id, state, labels - ) - key = self._labels_to_key(labels=labels) - if key: - self.log.debug('finishing job %s - %s (%s)', key, state, pod_id) - self.result_queue.put((key, state, pod_id, namespace, resource_version)) - - @staticmethod - def _strip_unsafe_kubernetes_special_chars(string): - """ - Kubernetes only supports lowercase alphanumeric characters and "-" and "." in - the pod name - However, there are special rules about how "-" and "." 
can be used so let's - only keep - alphanumeric chars see here for detail: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/ - - :param string: The requested Pod name - :return: ``str`` Pod name stripped of any unsafe characters - """ - return ''.join(ch.lower() for ind, ch in enumerate(string) if ch.isalnum()) - - @staticmethod - def _make_safe_pod_id(safe_dag_id, safe_task_id, safe_uuid): - """ - Kubernetes pod names must be <= 253 chars and must pass the following regex for - validation - ``^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`` - - :param safe_dag_id: a dag_id with only alphanumeric characters - :param safe_task_id: a task_id with only alphanumeric characters - :param safe_uuid: a uuid - :return: ``str`` valid Pod name of appropriate length - """ - safe_key = safe_dag_id + safe_task_id - - safe_pod_id = safe_key[:MAX_POD_ID_LEN - len(safe_uuid) - 1] + "-" + safe_uuid - - return safe_pod_id - - @staticmethod - def _make_safe_label_value(string): - """ - Valid label values must be 63 characters or less and must be empty or begin and - end with an alphanumeric character ([a-z0-9A-Z]) with dashes (-), underscores (_), - dots (.), and alphanumerics between. - - If the label value is then greater than 63 chars once made safe, or differs in any - way from the original value sent to this function, then we need to truncate to - 53chars, and append it with a unique hash. - """ - safe_label = re.sub(r'^[^a-z0-9A-Z]*|[^a-zA-Z0-9_\-\.]|[^a-z0-9A-Z]*$', '', string) - - if len(safe_label) > MAX_LABEL_LEN or string != safe_label: - safe_hash = hashlib.md5(string.encode()).hexdigest()[:9] - safe_label = safe_label[:MAX_LABEL_LEN - len(safe_hash) - 1] + "-" + safe_hash - - return safe_label - - @staticmethod - def _create_pod_id(dag_id, task_id): - safe_dag_id = AirflowKubernetesScheduler._strip_unsafe_kubernetes_special_chars( - dag_id) - safe_task_id = AirflowKubernetesScheduler._strip_unsafe_kubernetes_special_chars( - task_id) - safe_uuid = AirflowKubernetesScheduler._strip_unsafe_kubernetes_special_chars( - uuid4().hex) - return AirflowKubernetesScheduler._make_safe_pod_id(safe_dag_id, safe_task_id, - safe_uuid) - - @staticmethod - def _label_safe_datestring_to_datetime(string): - """ - Kubernetes doesn't permit ":" in labels. 
ISO datetime format uses ":" but not - "_", let's - replace ":" with "_" - - :param string: str - :return: datetime.datetime object - """ - return parser.parse(string.replace('_plus_', '+').replace("_", ":")) - - @staticmethod - def _datetime_to_label_safe_datestring(datetime_obj): - """ - Kubernetes doesn't like ":" in labels, since ISO datetime format uses ":" but - not "_" let's - replace ":" with "_" - - :param datetime_obj: datetime.datetime object - :return: ISO-like string representing the datetime - """ - return datetime_obj.isoformat().replace(":", "_").replace('+', '_plus_') - - def _labels_to_key(self, labels): - try_num = 1 - try: - try_num = int(labels.get('try_number', '1')) - except ValueError: - self.log.warning("could not get try_number as an int: %s", labels.get('try_number', '1')) - - try: - dag_id = labels['dag_id'] - task_id = labels['task_id'] - ex_time = self._label_safe_datestring_to_datetime(labels['execution_date']) - except Exception as e: - self.log.warning( - 'Error while retrieving labels; labels: %s; exception: %s', - labels, e - ) - return None - - with create_session() as session: - task = ( - session - .query(TaskInstance) - .filter_by(task_id=task_id, dag_id=dag_id, execution_date=ex_time) - .one_or_none() - ) - if task: - self.log.info( - 'Found matching task %s-%s (%s) with current state of %s', - task.dag_id, task.task_id, task.execution_date, task.state - ) - return (dag_id, task_id, ex_time, try_num) - else: - self.log.warning( - 'task_id/dag_id are not safe to use as Kubernetes labels. This can cause ' - 'severe performance regressions. Please see ' - '. ' - 'Given dag_id: %s, task_id: %s', task_id, dag_id - ) - - tasks = ( - session - .query(TaskInstance) - .filter_by(execution_date=ex_time).all() - ) - self.log.info( - 'Checking %s task instances.', - len(tasks) - ) - for task in tasks: - if ( - self._make_safe_label_value(task.dag_id) == dag_id and - self._make_safe_label_value(task.task_id) == task_id and - task.execution_date == ex_time - ): - self.log.info( - 'Found matching task %s-%s (%s) with current state of %s', - task.dag_id, task.task_id, task.execution_date, task.state - ) - dag_id = task.dag_id - task_id = task.task_id - return (dag_id, task_id, ex_time, try_num) - self.log.warning( - 'Failed to find and match task details to a pod; labels: %s', - labels - ) - return None - - def _flush_watcher_queue(self): - self.log.debug('Executor shutting down, watcher_queue approx. size=%d', self.watcher_queue.qsize()) - while True: - try: - task = self.watcher_queue.get_nowait() - # Ignoring it since it can only have either FAILED or SUCCEEDED pods - self.log.warning('Executor shutting down, IGNORING watcher task=%s', task) - self.watcher_queue.task_done() - except Empty: - break - - def terminate(self): - """Termninates the watcher.""" - self.log.debug("Terminating kube_watcher...") - self.kube_watcher.terminate() - self.kube_watcher.join() - self.log.debug("kube_watcher=%s", self.kube_watcher) - self.log.debug("Flushing watcher_queue...") - self._flush_watcher_queue() - # Queue should be empty... 
- self.watcher_queue.join() - self.log.debug("Shutting down manager...") - self._manager.shutdown() - - -class KubernetesExecutor(BaseExecutor, LoggingMixin): - """Executor for Kubernetes""" - def __init__(self): - self.kube_config = KubeConfig() - self.task_queue = None - self.result_queue = None - self.kube_scheduler = None - self.kube_client = None - self.worker_uuid = None - self._manager = multiprocessing.Manager() - super(KubernetesExecutor, self).__init__(parallelism=self.kube_config.parallelism) - - @provide_session - def clear_not_launched_queued_tasks(self, session=None): - """ - If the airflow scheduler restarts with pending "Queued" tasks, the tasks may or - may not - have been launched Thus, on starting up the scheduler let's check every - "Queued" task to - see if it has been launched (ie: if there is a corresponding pod on kubernetes) - - If it has been launched then do nothing, otherwise reset the state to "None" so - the task - will be rescheduled - - This will not be necessary in a future version of airflow in which there is - proper support - for State.LAUNCHED - """ - queued_tasks = session \ - .query(TaskInstance) \ - .filter(TaskInstance.state == State.QUEUED).all() - self.log.info( - 'When executor started up, found %s queued task instances', - len(queued_tasks) - ) - - for task in queued_tasks: - # noinspection PyProtectedMember - # pylint: disable=protected-access - dict_string = ( - "dag_id={},task_id={},execution_date={},airflow-worker={}".format( - AirflowKubernetesScheduler._make_safe_label_value(task.dag_id), - AirflowKubernetesScheduler._make_safe_label_value(task.task_id), - AirflowKubernetesScheduler._datetime_to_label_safe_datestring( - task.execution_date - ), - self.worker_uuid - ) - ) - # pylint: enable=protected-access - kwargs = dict(label_selector=dict_string) - if self.kube_config.kube_client_request_args: - for key, value in self.kube_config.kube_client_request_args.items(): - kwargs[key] = value - pod_list = self.kube_client.list_namespaced_pod( - self.kube_config.kube_namespace, **kwargs) - if not pod_list.items: - self.log.info( - 'TaskInstance: %s found in queued state but was not launched, ' - 'rescheduling', task - ) - session.query(TaskInstance).filter( - TaskInstance.dag_id == task.dag_id, - TaskInstance.task_id == task.task_id, - TaskInstance.execution_date == task.execution_date - ).update({TaskInstance.state: State.NONE}) - - def _inject_secrets(self): - def _create_or_update_secret(secret_name, secret_path): - try: - return self.kube_client.create_namespaced_secret( - self.kube_config.executor_namespace, kubernetes.client.V1Secret( - data={ - 'key.json': base64.b64encode(open(secret_path, 'r').read())}, - metadata=kubernetes.client.V1ObjectMeta(name=secret_name)), - **self.kube_config.kube_client_request_args) - except ApiException as e: - if e.status == 409: - return self.kube_client.replace_namespaced_secret( - secret_name, self.kube_config.executor_namespace, - kubernetes.client.V1Secret( - data={'key.json': base64.b64encode( - open(secret_path, 'r').read())}, - metadata=kubernetes.client.V1ObjectMeta(name=secret_name)), - **self.kube_config.kube_client_request_args) - self.log.exception( - 'Exception while trying to inject secret. ' - 'Secret name: %s, error details: %s', - secret_name, e - ) - raise - - # For each GCP service account key, inject it as a secret in executor - # namespace with the specific secret name configured in the airflow.cfg. - # We let exceptions to pass through to users. 
- if self.kube_config.gcp_service_account_keys: - name_path_pair_list = [ - {'name': account_spec.strip().split('=')[0], - 'path': account_spec.strip().split('=')[1]} - for account_spec in self.kube_config.gcp_service_account_keys.split(',')] - for service_account in name_path_pair_list: - _create_or_update_secret(service_account['name'], service_account['path']) - - def start(self): - """Starts the executor""" - self.log.info('Start Kubernetes executor') - self.worker_uuid = KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid() - self.log.debug('Start with worker_uuid: %s', self.worker_uuid) - # always need to reset resource version since we don't know - # when we last started, note for behavior below - # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs - # /CoreV1Api.md#list_namespaced_pod - KubeResourceVersion.reset_resource_version() - self.task_queue = self._manager.Queue() - self.result_queue = self._manager.Queue() - self.kube_client = get_kube_client() - self.kube_scheduler = AirflowKubernetesScheduler( - self.kube_config, self.task_queue, self.result_queue, - self.kube_client, self.worker_uuid - ) - self._inject_secrets() - self.clear_not_launched_queued_tasks() - - def execute_async(self, key, command, queue=None, executor_config=None): - """Executes task asynchronously""" - self.log.info( - 'Add task %s with command %s with executor_config %s', - key, command, executor_config - ) - kube_executor_config = KubernetesExecutorConfig.from_dict(executor_config) - self.task_queue.put((key, command, kube_executor_config)) - - def sync(self): - """Synchronize task state.""" - if self.running: - self.log.debug('self.running: %s', self.running) - if self.queued_tasks: - self.log.debug('self.queued: %s', self.queued_tasks) - self.kube_scheduler.sync() - - last_resource_version = None - while True: - try: - results = self.result_queue.get_nowait() - try: - key, state, pod_id, namespace, resource_version = results - last_resource_version = resource_version - self.log.info('Changing state of %s to %s', results, state) - try: - self._change_state(key, state, pod_id, namespace) - except Exception as e: - self.log.exception('Exception: %s when attempting ' + - 'to change state of %s to %s, re-queueing.', e, results, state) - self.result_queue.put(results) - finally: - self.result_queue.task_done() - except Empty: - break - - KubeResourceVersion.checkpoint_resource_version(last_resource_version) - - for _ in range(self.kube_config.worker_pods_creation_batch_size): - try: - task = self.task_queue.get_nowait() - try: - self.kube_scheduler.run_next(task) - except ApiException as e: - self.log.warning('ApiException when attempting to run task, re-queueing. ' - 'Message: %s' % json.loads(e.body)['message']) - self.task_queue.put(task) - except HTTPError as e: - self.log.warning('HTTPError when attempting to run task, re-queueing. 
' - 'Exception: %s', str(e)) - self.task_queue.put(task) - finally: - self.task_queue.task_done() - except Empty: - break - - def _change_state(self, key, state, pod_id, namespace): - if state != State.RUNNING: - if self.kube_config.delete_worker_pods: - self.kube_scheduler.delete_pod(pod_id, namespace) - self.log.info('Deleted pod: %s in namespace %s', str(key), str(namespace)) - try: - self.running.pop(key) - except KeyError: - self.log.debug('Could not find key: %s', str(key)) - self.event_buffer[key] = state - - def _flush_task_queue(self): - self.log.debug('Executor shutting down, task_queue approximate size=%d', self.task_queue.qsize()) - while True: - try: - task = self.task_queue.get_nowait() - # This is a new task to run thus ok to ignore. - self.log.warning('Executor shutting down, will NOT run task=%s', task) - self.task_queue.task_done() - except Empty: - break - - def _flush_result_queue(self): - self.log.debug('Executor shutting down, result_queue approximate size=%d', self.result_queue.qsize()) - while True: # pylint: disable=too-many-nested-blocks - try: - results = self.result_queue.get_nowait() - self.log.warning('Executor shutting down, flushing results=%s', results) - try: - key, state, pod_id, namespace, resource_version = results - self.log.info('Changing state of %s to %s : resource_version=%d', results, state, - resource_version) - try: - self._change_state(key, state, pod_id, namespace) - except Exception as e: # pylint: disable=broad-except - self.log.exception('Ignoring exception: %s when attempting to change state of %s ' - 'to %s.', e, results, state) - finally: - self.result_queue.task_done() - except Empty: - break - def end(self): - """Called when the executor shuts down""" - self.log.info('Shutting down Kubernetes executor') - self.log.debug('Flushing task_queue...') - self._flush_task_queue() - self.log.debug('Flushing result_queue...') - self._flush_result_queue() - # Both queues should be empty... - self.task_queue.join() - self.result_queue.join() - if self.kube_scheduler: - self.kube_scheduler.terminate() - self._manager.shutdown() +from airflow.executors.kubernetes_executor import KubernetesExecutor # noqa diff --git a/airflow/contrib/hooks/bigquery_hook.py b/airflow/contrib/hooks/bigquery_hook.py index 4948ca4263135..e99aa73bdb2d2 100644 --- a/airflow/contrib/hooks/bigquery_hook.py +++ b/airflow/contrib/hooks/bigquery_hook.py @@ -21,9 +21,9 @@ This module contains a BigQuery Hook, as well as a very basic PEP 249 implementation for BigQuery. """ - -import time +import logging import six +import time from builtins import range from copy import deepcopy from six import iteritems @@ -43,6 +43,8 @@ _test_google_api_imports as gbq_test_google_api_imports from pandas_gbq.gbq import GbqConnector +log = logging.getLogger(__name__) + class BigQueryHook(GoogleCloudBaseHook, DbApiHook): """ @@ -83,7 +85,7 @@ def get_service(self): return build( 'bigquery', 'v2', http=http_authorized, cache_discovery=False) - def insert_rows(self, table, rows, target_fields=None, commit_every=1000): + def insert_rows(self, table, rows, target_fields=None, commit_every=1000, **kwargs): """ Insertion is currently unsupported. 
Theoretically, you could use BigQuery's streaming API to insert rows into a table, but this hasn't @@ -91,7 +93,7 @@ def insert_rows(self, table, rows, target_fields=None, commit_every=1000): """ raise NotImplementedError() - def get_pandas_df(self, sql, parameters=None, dialect=None): + def get_pandas_df(self, sql, parameters=None, dialect=None, **kwargs): """ Returns a Pandas DataFrame for the results produced by a BigQuery query. The DbApiHook method must be overridden because Pandas @@ -108,6 +110,8 @@ def get_pandas_df(self, sql, parameters=None, dialect=None): :param dialect: Dialect of BigQuery SQL – legacy SQL or standard SQL defaults to use `self.use_legacy_sql` if not specified :type dialect: str in {'legacy', 'standard'} + :param kwargs: (optional) passed into pandas_gbq.read_gbq method + :type kwargs: dict """ private_key = self._get_field('key_path', None) or self._get_field('keyfile_dict', None) @@ -118,7 +122,8 @@ def get_pandas_df(self, sql, parameters=None, dialect=None): project_id=self._get_field('project'), dialect=dialect, verbose=False, - private_key=private_key) + private_key=private_key, + **kwargs) def table_exists(self, project_id, dataset_id, table_id): """ @@ -2233,7 +2238,6 @@ def var_print(var_name): if project_id is None: if var_name is not None: - log = LoggingMixin().log log.info( 'Project not included in %s: %s; using project "%s"', var_name, table_input, default_project_id diff --git a/airflow/contrib/hooks/cloudant_hook.py b/airflow/contrib/hooks/cloudant_hook.py index 5d39f3fa8af1e..b160ae4987004 100644 --- a/airflow/contrib/hooks/cloudant_hook.py +++ b/airflow/contrib/hooks/cloudant_hook.py @@ -16,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import logging from past.builtins import unicode @@ -23,7 +24,8 @@ from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) class CloudantHook(BaseHook): @@ -40,7 +42,6 @@ def get_conn(self): def _str(s): # cloudant-python doesn't support unicode. if isinstance(s, unicode): - log = LoggingMixin().log log.debug( 'cloudant-python does not support unicode. 
Encoding %s as ' 'ascii using "ignore".', s diff --git a/airflow/contrib/hooks/gcp_api_base_hook.py b/airflow/contrib/hooks/gcp_api_base_hook.py index 3ba68e25a771f..92899c34c01c0 100644 --- a/airflow/contrib/hooks/gcp_api_base_hook.py +++ b/airflow/contrib/hooks/gcp_api_base_hook.py @@ -35,11 +35,11 @@ import tenacity from googleapiclient.http import set_user_agent -from airflow import LoggingMixin, version +from airflow import version from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -logger = LoggingMixin().log +log = logging.getLogger(__name__) _DEFAULT_SCOPES = ('https://www.googleapis.com/auth/cloud-platform',) @@ -234,8 +234,8 @@ def decorator(fun): default_kwargs = { 'wait': tenacity.wait_exponential(multiplier=1, max=100), 'retry': retry_if_temporary_quota(), - 'before': tenacity.before_log(logger, logging.DEBUG), - 'after': tenacity.after_log(logger, logging.DEBUG), + 'before': tenacity.before_log(log, logging.DEBUG), + 'after': tenacity.after_log(log, logging.DEBUG), } default_kwargs.update(**kwargs) return tenacity.retry( diff --git a/airflow/contrib/hooks/gcp_dataproc_hook.py b/airflow/contrib/hooks/gcp_dataproc_hook.py index aae4e7aaa0f09..dec36187e3f5a 100644 --- a/airflow/contrib/hooks/gcp_dataproc_hook.py +++ b/airflow/contrib/hooks/gcp_dataproc_hook.py @@ -337,7 +337,7 @@ def cancel(self, project_id, job_id, region='global'): projectId=project_id, region=region, jobId=job_id - ) + ).execute(num_retries=self.num_retries) setattr( diff --git a/airflow/contrib/hooks/gcp_mlengine_hook.py b/airflow/contrib/hooks/gcp_mlengine_hook.py index 2ca0bf5502691..edf51e00820e5 100644 --- a/airflow/contrib/hooks/gcp_mlengine_hook.py +++ b/airflow/contrib/hooks/gcp_mlengine_hook.py @@ -14,18 +14,18 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +import logging import random import time from googleapiclient.errors import HttpError from googleapiclient.discovery import build from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) def _poll_with_exponential_delay(request, max_n, is_done_func, is_error_func): - log = LoggingMixin().log for i in range(0, max_n): try: diff --git a/airflow/contrib/hooks/qubole_check_hook.py b/airflow/contrib/hooks/qubole_check_hook.py index 303c19b648e24..137bab7e84545 100644 --- a/airflow/contrib/hooks/qubole_check_hook.py +++ b/airflow/contrib/hooks/qubole_check_hook.py @@ -17,7 +17,8 @@ # specific language governing permissions and limitations # under the License. 
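Editor's note: the gcp_api_base_hook hunk above replaces the LoggingMixin-based logger with a module-level logging.getLogger(__name__) and feeds it into tenacity's before_log/after_log hooks. The following is a minimal, hypothetical sketch of that pattern only; the function name and retry settings are illustrative and not part of the patch.

import logging

import tenacity

log = logging.getLogger(__name__)


@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, max=100),
    stop=tenacity.stop_after_attempt(5),
    before=tenacity.before_log(log, logging.DEBUG),   # logged via the module-level logger
    after=tenacity.after_log(log, logging.DEBUG),
)
def call_quota_limited_api():
    """Placeholder for a call that may hit a temporary quota error."""
    return "ok"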
# -from airflow.utils.log.logging_mixin import LoggingMixin +import logging + from airflow.contrib.hooks.qubole_hook import QuboleHook from airflow.exceptions import AirflowException from qds_sdk.commands import Command @@ -32,6 +33,9 @@ ROW_DELIM = '\r\n' +log = logging.getLogger(__name__) + + def isint(value): try: int(value) @@ -92,7 +96,6 @@ def handle_failure_retry(context): cmd = Command.find(cmd_id) if cmd is not None: if cmd.status == 'running': - log = LoggingMixin().log log.info('Cancelling the Qubole Command Id: %s', cmd_id) cmd.cancel() @@ -104,7 +107,6 @@ def get_first(self, sql): return record_list def get_query_results(self): - log = LoggingMixin().log if self.cmd is not None: cmd_id = self.cmd.id log.info("command id: " + str(cmd_id)) diff --git a/airflow/contrib/hooks/qubole_hook.py b/airflow/contrib/hooks/qubole_hook.py index 37433a2cb1b0d..9facd2ca4fd44 100644 --- a/airflow/contrib/hooks/qubole_hook.py +++ b/airflow/contrib/hooks/qubole_hook.py @@ -18,6 +18,7 @@ # under the License. # """Qubole hook""" +import logging import os import time import datetime @@ -31,9 +32,11 @@ from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook from airflow.configuration import conf, mkdir_p -from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.state import State +log = logging.getLogger(__name__) + + COMMAND_CLASSES = { "hivecmd": HiveCommand, "prestocmd": PrestoCommand, @@ -117,7 +120,6 @@ def handle_failure_retry(context): if cmd_id is not None: cmd = Command.find(cmd_id) if cmd is not None: - log = LoggingMixin().log if cmd.status == 'done': log.info('Command ID: %s has been succeeded, hence marking this ' 'TI as Success.', cmd_id) diff --git a/airflow/contrib/hooks/salesforce_hook.py b/airflow/contrib/hooks/salesforce_hook.py index a1756b6530b6a..075941868c797 100644 --- a/airflow/contrib/hooks/salesforce_hook.py +++ b/airflow/contrib/hooks/salesforce_hook.py @@ -26,15 +26,17 @@ NOTE: this hook also relies on the simple_salesforce package: https://github.com/simple-salesforce/simple-salesforce """ +import json +import logging +import time + from simple_salesforce import Salesforce from airflow.hooks.base_hook import BaseHook -import json - import pandas as pd -import time -from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) class SalesforceHook(BaseHook): @@ -176,7 +178,6 @@ def _to_timestamp(cls, col): try: col = pd.to_datetime(col) except ValueError: - log = LoggingMixin().log log.warning( "Could not convert field to timestamps: %s", col.name ) diff --git a/airflow/contrib/hooks/slack_webhook_hook.py b/airflow/contrib/hooks/slack_webhook_hook.py index f3817b3772e24..001973a80d208 100644 --- a/airflow/contrib/hooks/slack_webhook_hook.py +++ b/airflow/contrib/hooks/slack_webhook_hook.py @@ -58,6 +58,8 @@ class SlackWebhookHook(HttpHook): :type link_names: bool :param proxy: Proxy to use to make the Slack webhook call :type proxy: str + :param extra_options: Extra options for http hook + :type extra_options: dict """ def __init__(self, @@ -72,6 +74,7 @@ def __init__(self, icon_url=None, link_names=False, proxy=None, + extra_options=None, *args, **kwargs ): @@ -86,6 +89,7 @@ def __init__(self, self.icon_url = icon_url self.link_names = link_names self.proxy = proxy + self.extra_options = extra_options or {} def _get_token(self, token, http_conn_id): """ @@ -140,13 +144,13 @@ def execute(self): """ Remote Popen (actually execute the slack webhook call) """ - proxies = {} + if 
self.proxy: # we only need https proxy for Slack, as the endpoint is https - proxies = {'https': self.proxy} + self.extra_options.update({'proxies': {'https': self.proxy}}) slack_message = self._build_slack_message() self.run(endpoint=self.webhook_token, data=slack_message, headers={'Content-type': 'application/json'}, - extra_options={'proxies': proxies}) + extra_options=self.extra_options) diff --git a/airflow/contrib/hooks/snowflake_hook.py b/airflow/contrib/hooks/snowflake_hook.py index cd6c1c9af30f3..8574336687249 100644 --- a/airflow/contrib/hooks/snowflake_hook.py +++ b/airflow/contrib/hooks/snowflake_hook.py @@ -44,6 +44,7 @@ def __init__(self, *args, **kwargs): self.region = kwargs.pop("region", None) self.role = kwargs.pop("role", None) self.schema = kwargs.pop("schema", None) + self.authenticator = kwargs.pop("authenticator", None) def _get_conn_params(self): """ @@ -56,6 +57,7 @@ def _get_conn_params(self): database = conn.extra_dejson.get('database', None) region = conn.extra_dejson.get("region", None) role = conn.extra_dejson.get('role', None) + authenticator = conn.extra_dejson.get('authenticator', 'snowflake') conn_config = { "user": conn.login, @@ -65,8 +67,8 @@ def _get_conn_params(self): "account": self.account or account or '', "warehouse": self.warehouse or warehouse or '', "region": self.region or region or '', - "role": self.role or role or '' - + "role": self.role or role, + "authenticator": self.authenticator or authenticator } """ @@ -103,7 +105,7 @@ def get_uri(self): """ conn_config = self._get_conn_params() uri = 'snowflake://{user}:{password}@{account}/{database}/' - uri += '{schema}?warehouse={warehouse}&role={role}' + uri += '{schema}?warehouse={warehouse}&role={role}&authenticator={authenticator}' return uri.format(**conn_config) def get_conn(self): diff --git a/airflow/contrib/hooks/spark_jdbc_hook.py b/airflow/contrib/hooks/spark_jdbc_hook.py index c188b1e863de2..a295fa3ecb695 100644 --- a/airflow/contrib/hooks/spark_jdbc_hook.py +++ b/airflow/contrib/hooks/spark_jdbc_hook.py @@ -144,7 +144,7 @@ def __init__(self, super(SparkJDBCHook, self).__init__(*args, **kwargs) self._name = spark_app_name self._conn_id = spark_conn_id - self._conf = spark_conf + self._conf = spark_conf or {} self._py_files = spark_py_files self._files = spark_files self._jars = spark_jars diff --git a/airflow/contrib/hooks/spark_submit_hook.py b/airflow/contrib/hooks/spark_submit_hook.py index 32bbf91de0d65..8fd6c69f026fd 100644 --- a/airflow/contrib/hooks/spark_submit_hook.py +++ b/airflow/contrib/hooks/spark_submit_hook.py @@ -27,7 +27,7 @@ from airflow.utils.log.logging_mixin import LoggingMixin try: - from airflow.contrib.kubernetes import kube_client + from airflow.kubernetes import kube_client except ImportError: pass diff --git a/airflow/contrib/kubernetes/__init__.py b/airflow/contrib/kubernetes/__init__.py index 13a83393a9124..b7f8352944d3f 100644 --- a/airflow/contrib/kubernetes/__init__.py +++ b/airflow/contrib/kubernetes/__init__.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -14,3 +16,4 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
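Editor's note: the SlackWebhookHook hunk above adds an extra_options argument that is merged with the proxy setting and forwarded to HttpHook.run(). A hedged usage sketch follows; the connection id, message, and option values are invented for illustration and are not taken from the patch.

from airflow.contrib.hooks.slack_webhook_hook import SlackWebhookHook

hook = SlackWebhookHook(
    http_conn_id='slack_default',            # assumed connection id
    message='DAG run finished',
    proxy='https://proxy.example.com:3128',  # merged into extra_options as {'proxies': ...}
    extra_options={'timeout': 30},           # passed through unchanged to HttpHook.run()
)
hook.execute()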
+# diff --git a/airflow/contrib/kubernetes/kube_client.py b/airflow/contrib/kubernetes/kube_client.py index ab37f1decf926..d785fac73a9db 100644 --- a/airflow/contrib/kubernetes/kube_client.py +++ b/airflow/contrib/kubernetes/kube_client.py @@ -14,91 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Client for kubernetes communication""" +"""This module is deprecated. Please use `airflow.kubernetes.kube_client`.""" -from typing import Optional +import warnings -from airflow.configuration import conf -from six import PY2 +# pylint: disable=unused-import +from airflow.kubernetes.kube_client import * # noqa -try: - from kubernetes import config, client - from kubernetes.client.rest import ApiException # pylint: disable=unused-import - from kubernetes.client.api_client import ApiClient - from kubernetes.client import Configuration - from airflow.contrib.kubernetes.refresh_config import ( # pylint: disable=ungrouped-imports - load_kube_config, - RefreshConfiguration, - ) - has_kubernetes = True - - def _get_kube_config(in_cluster, # type: bool - cluster_context, # type: Optional[str] - config_file, # type: Optional[str] - ): # type: (...) -> Optional[Configuration] - if in_cluster: - # load_incluster_config set default configuration with config populated by k8s - config.load_incluster_config() - cfg = None - else: - # this block can be replaced with just config.load_kube_config once - # refresh_config module is replaced with upstream fix - cfg = RefreshConfiguration() - load_kube_config( - client_configuration=cfg, config_file=config_file, context=cluster_context) - - if PY2: - # For connect_get_namespaced_pod_exec - configuration = Configuration() - configuration.assert_hostname = False - Configuration.set_default(configuration) - return cfg - - def _get_client_with_patched_configuration(cfg): # type (Optional[Configuration]) -> client.CoreV1Api: - ''' - This is a workaround for supporting api token refresh in k8s client. - - The function can be replace with `return client.CoreV1Api()` once the - upstream client supports token refresh. - ''' - if cfg: - return client.CoreV1Api(api_client=ApiClient(configuration=cfg)) - else: - return client.CoreV1Api() - -except ImportError as e: - # We need an exception class to be able to use it in ``except`` elsewhere - # in the code base - ApiException = BaseException - has_kubernetes = False - _import_err = e - - -def get_kube_client(in_cluster=conf.getboolean('kubernetes', 'in_cluster'), # type: bool - cluster_context=None, # type: Optional[str] - config_file=None, # type: Optional[str] - ): - """ - Retrieves Kubernetes client - - :param in_cluster: whether we are in cluster - :type in_cluster: bool - :param cluster_context: context of the cluster - :type cluster_context: str - :param config_file: configuration file - :type config_file: str - :return kubernetes client - :rtype client.CoreV1Api - """ - - if not has_kubernetes: - raise _import_err - - if not in_cluster: - if cluster_context is None: - cluster_context = conf.get('kubernetes', 'cluster_context', fallback=None) - if config_file is None: - config_file = conf.get('kubernetes', 'config_file', fallback=None) - - client_conf = _get_kube_config(in_cluster, cluster_context, config_file) - return _get_client_with_patched_configuration(client_conf) +warnings.warn( + "This module is deprecated. 
Please use `airflow.kubernetes.kube_client`.", + DeprecationWarning, stacklevel=2 +) diff --git a/airflow/contrib/kubernetes/kubernetes_request_factory/kubernetes_request_factory.py b/airflow/contrib/kubernetes/kubernetes_request_factory/kubernetes_request_factory.py deleted file mode 100644 index ad5851099be96..0000000000000 --- a/airflow/contrib/kubernetes/kubernetes_request_factory/kubernetes_request_factory.py +++ /dev/null @@ -1,258 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from abc import ABCMeta, abstractmethod -import six - - -class KubernetesRequestFactory: - """ - Create requests to be sent to kube API. - Extend this class to talk to kubernetes and generate your specific resources. - This is equivalent of generating yaml files that can be used by `kubectl` - """ - __metaclass__ = ABCMeta - - @abstractmethod - def create(self, pod): - """ - Creates the request for kubernetes API. - - :param pod: The pod object - """ - - @staticmethod - def extract_image(pod, req): - req['spec']['containers'][0]['image'] = pod.image - - @staticmethod - def extract_image_pull_policy(pod, req): - if pod.image_pull_policy: - req['spec']['containers'][0]['imagePullPolicy'] = pod.image_pull_policy - - @staticmethod - def add_secret_to_env(env, secret): - env.append({ - 'name': secret.deploy_target, - 'valueFrom': { - 'secretKeyRef': { - 'name': secret.secret, - 'key': secret.key - } - } - }) - - @staticmethod - def add_runtime_info_env(env, runtime_info): - env.append({ - 'name': runtime_info.name, - 'valueFrom': { - 'fieldRef': { - 'fieldPath': runtime_info.field_path - } - } - }) - - @staticmethod - def extract_labels(pod, req): - req['metadata']['labels'] = req['metadata'].get('labels', {}) - for k, v in six.iteritems(pod.labels): - req['metadata']['labels'][k] = v - - @staticmethod - def extract_annotations(pod, req): - req['metadata']['annotations'] = req['metadata'].get('annotations', {}) - for k, v in six.iteritems(pod.annotations): - req['metadata']['annotations'][k] = v - - @staticmethod - def extract_affinity(pod, req): - req['spec']['affinity'] = req['spec'].get('affinity', {}) - for k, v in six.iteritems(pod.affinity): - req['spec']['affinity'][k] = v - - @staticmethod - def extract_node_selector(pod, req): - req['spec']['nodeSelector'] = req['spec'].get('nodeSelector', {}) - for k, v in six.iteritems(pod.node_selectors): - req['spec']['nodeSelector'][k] = v - - @staticmethod - def extract_cmds(pod, req): - req['spec']['containers'][0]['command'] = pod.cmds - - @staticmethod - def extract_args(pod, req): - req['spec']['containers'][0]['args'] = pod.args - - @staticmethod - def attach_ports(pod, req): - req['spec']['containers'][0]['ports'] = ( - req['spec']['containers'][0].get('ports', [])) - if len(pod.ports) > 0: - 
req['spec']['containers'][0]['ports'].extend(pod.ports) - - @staticmethod - def attach_volumes(pod, req): - req['spec']['volumes'] = ( - req['spec'].get('volumes', [])) - if len(pod.volumes) > 0: - req['spec']['volumes'].extend(pod.volumes) - - @staticmethod - def attach_volume_mounts(pod, req): - if len(pod.volume_mounts) > 0: - req['spec']['containers'][0]['volumeMounts'] = ( - req['spec']['containers'][0].get('volumeMounts', [])) - req['spec']['containers'][0]['volumeMounts'].extend(pod.volume_mounts) - - @staticmethod - def extract_name(pod, req): - req['metadata']['name'] = pod.name - - @staticmethod - def extract_volume_secrets(pod, req): - vol_secrets = [s for s in pod.secrets if s.deploy_type == 'volume'] - if any(vol_secrets): - req['spec']['containers'][0]['volumeMounts'] = ( - req['spec']['containers'][0].get('volumeMounts', [])) - req['spec']['volumes'] = ( - req['spec'].get('volumes', [])) - for idx, vol in enumerate(vol_secrets): - vol_id = 'secretvol' + str(idx) - req['spec']['containers'][0]['volumeMounts'].append({ - 'mountPath': vol.deploy_target, - 'name': vol_id, - 'readOnly': True - }) - req['spec']['volumes'].append({ - 'name': vol_id, - 'secret': { - 'secretName': vol.secret - } - }) - - @staticmethod - def extract_env_and_secrets(pod, req): - envs_from_key_secrets = [ - env for env in pod.secrets if env.deploy_type == 'env' and env.key is not None - ] - - if len(pod.envs) > 0 or len(envs_from_key_secrets) > 0 or len(pod.pod_runtime_info_envs) > 0: - env = [] - for runtime_info in pod.pod_runtime_info_envs: - KubernetesRequestFactory.add_runtime_info_env(env, runtime_info) - for k in pod.envs.keys(): - env.append({'name': k, 'value': pod.envs[k]}) - for secret in envs_from_key_secrets: - KubernetesRequestFactory.add_secret_to_env(env, secret) - - req['spec']['containers'][0]['env'] = env - - KubernetesRequestFactory._apply_env_from(pod, req) - - @staticmethod - def extract_resources(pod, req): - if not pod.resources or pod.resources.is_empty_resource_request(): - return - - req['spec']['containers'][0]['resources'] = {} - - if pod.resources.has_requests(): - req['spec']['containers'][0]['resources']['requests'] = {} - if pod.resources.request_memory: - req['spec']['containers'][0]['resources']['requests'][ - 'memory'] = pod.resources.request_memory - if pod.resources.request_cpu: - req['spec']['containers'][0]['resources']['requests'][ - 'cpu'] = pod.resources.request_cpu - - if pod.resources.has_limits(): - req['spec']['containers'][0]['resources']['limits'] = {} - if pod.resources.limit_memory: - req['spec']['containers'][0]['resources']['limits'][ - 'memory'] = pod.resources.limit_memory - if pod.resources.limit_cpu: - req['spec']['containers'][0]['resources']['limits'][ - 'cpu'] = pod.resources.limit_cpu - if pod.resources.limit_gpu: - req['spec']['containers'][0]['resources']['limits'][ - 'nvidia.com/gpu'] = pod.resources.limit_gpu - - @staticmethod - def extract_init_containers(pod, req): - if pod.init_containers: - req['spec']['initContainers'] = pod.init_containers - - @staticmethod - def extract_service_account_name(pod, req): - if pod.service_account_name: - req['spec']['serviceAccountName'] = pod.service_account_name - - @staticmethod - def extract_hostnetwork(pod, req): - if pod.hostnetwork: - req['spec']['hostNetwork'] = pod.hostnetwork - - @staticmethod - def extract_dnspolicy(pod, req): - if pod.dnspolicy: - req['spec']['dnsPolicy'] = pod.dnspolicy - - @staticmethod - def extract_image_pull_secrets(pod, req): - if pod.image_pull_secrets: - 
req['spec']['imagePullSecrets'] = [{ - 'name': pull_secret - } for pull_secret in pod.image_pull_secrets.split(',')] - - @staticmethod - def extract_tolerations(pod, req): - if pod.tolerations: - req['spec']['tolerations'] = pod.tolerations - - @staticmethod - def extract_security_context(pod, req): - if pod.security_context: - req['spec']['securityContext'] = pod.security_context - - @staticmethod - def _apply_env_from(pod, req): - envs_from_secrets = [ - env for env in pod.secrets if env.deploy_type == 'env' and env.key is None - ] - - if pod.configmaps or envs_from_secrets: - req['spec']['containers'][0]['envFrom'] = [] - - for secret in envs_from_secrets: - req['spec']['containers'][0]['envFrom'].append( - { - 'secretRef': { - 'name': secret.secret - } - } - ) - - for configmap in pod.configmaps: - req['spec']['containers'][0]['envFrom'].append( - { - 'configMapRef': { - 'name': configmap - } - } - ) diff --git a/airflow/contrib/kubernetes/kubernetes_request_factory/pod_request_factory.py b/airflow/contrib/kubernetes/kubernetes_request_factory/pod_request_factory.py deleted file mode 100644 index 4c2ea25ca2f87..0000000000000 --- a/airflow/contrib/kubernetes/kubernetes_request_factory/pod_request_factory.py +++ /dev/null @@ -1,135 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import yaml -from airflow.contrib.kubernetes.pod import Pod -from airflow.contrib.kubernetes.kubernetes_request_factory.kubernetes_request_factory \ - import KubernetesRequestFactory - - -class SimplePodRequestFactory(KubernetesRequestFactory): - """ - Request generator for a simple pod. 
- """ - _yaml = """apiVersion: v1 -kind: Pod -metadata: - name: name -spec: - containers: - - name: base - image: airflow-worker:latest - command: ["/usr/local/airflow/entrypoint.sh", "/bin/bash sleep 25"] - restartPolicy: Never - """ - - def __init__(self): - pass - - def create(self, pod): - # type: (Pod) -> dict - req = yaml.safe_load(self._yaml) - self.extract_name(pod, req) - self.extract_labels(pod, req) - self.extract_image(pod, req) - self.extract_image_pull_policy(pod, req) - self.extract_cmds(pod, req) - self.extract_args(pod, req) - self.extract_node_selector(pod, req) - self.extract_env_and_secrets(pod, req) - self.extract_volume_secrets(pod, req) - self.attach_ports(pod, req) - self.attach_volumes(pod, req) - self.attach_volume_mounts(pod, req) - self.extract_resources(pod, req) - self.extract_service_account_name(pod, req) - self.extract_init_containers(pod, req) - self.extract_image_pull_secrets(pod, req) - self.extract_annotations(pod, req) - self.extract_affinity(pod, req) - self.extract_hostnetwork(pod, req) - self.extract_tolerations(pod, req) - self.extract_security_context(pod, req) - self.extract_dnspolicy(pod, req) - return req - - -class ExtractXcomPodRequestFactory(KubernetesRequestFactory): - """ - Request generator for a pod with sidecar container. - """ - XCOM_MOUNT_PATH = '/airflow/xcom' - SIDECAR_CONTAINER_NAME = 'airflow-xcom-sidecar' - _yaml = """apiVersion: v1 -kind: Pod -metadata: - name: name -spec: - volumes: - - name: xcom - emptyDir: {{}} - containers: - - name: base - image: airflow-worker:latest - command: ["/usr/local/airflow/entrypoint.sh", "/bin/bash sleep 25"] - volumeMounts: - - name: xcom - mountPath: {xcomMountPath} - - name: {sidecarContainerName} - image: alpine - command: - - sh - - -c - - 'trap "exit 0" INT; while true; do sleep 30; done;' - volumeMounts: - - name: xcom - mountPath: {xcomMountPath} - resources: - requests: - cpu: 1m - restartPolicy: Never - """.format(xcomMountPath=XCOM_MOUNT_PATH, sidecarContainerName=SIDECAR_CONTAINER_NAME) - - def __init__(self): - pass - - def create(self, pod): - # type: (Pod) -> dict - req = yaml.safe_load(self._yaml) - self.extract_name(pod, req) - self.extract_labels(pod, req) - self.extract_image(pod, req) - self.extract_image_pull_policy(pod, req) - self.extract_cmds(pod, req) - self.extract_args(pod, req) - self.extract_node_selector(pod, req) - self.extract_env_and_secrets(pod, req) - self.extract_volume_secrets(pod, req) - self.attach_ports(pod, req) - self.attach_volumes(pod, req) - self.attach_volume_mounts(pod, req) - self.extract_resources(pod, req) - self.extract_service_account_name(pod, req) - self.extract_init_containers(pod, req) - self.extract_image_pull_secrets(pod, req) - self.extract_annotations(pod, req) - self.extract_affinity(pod, req) - self.extract_hostnetwork(pod, req) - self.extract_tolerations(pod, req) - self.extract_security_context(pod, req) - self.extract_dnspolicy(pod, req) - return req diff --git a/airflow/contrib/kubernetes/pod.py b/airflow/contrib/kubernetes/pod.py index 4c10e66590647..7e38147eff301 100644 --- a/airflow/contrib/kubernetes/pod.py +++ b/airflow/contrib/kubernetes/pod.py @@ -14,51 +14,29 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""This module is deprecated. 
Please use `airflow.kubernetes.pod`.""" +import warnings -class Resources: - __slots__ = ('request_memory', 'request_cpu', 'limit_memory', 'limit_cpu', 'limit_gpu') +# pylint: disable=unused-import +from typing import List, Union - def __init__( - self, - request_memory=None, - request_cpu=None, - limit_memory=None, - limit_cpu=None, - limit_gpu=None): - self.request_memory = request_memory - self.request_cpu = request_cpu - self.limit_memory = limit_memory - self.limit_cpu = limit_cpu - self.limit_gpu = limit_gpu - - def is_empty_resource_request(self): - return not self.has_limits() and not self.has_requests() - - def has_limits(self): - return self.limit_cpu is not None or self.limit_memory is not None or self.limit_gpu is not None - - def has_requests(self): - return self.request_cpu is not None or self.request_memory is not None - - def __str__(self): - return "Request: [cpu: {}, memory: {}], Limit: [cpu: {}, memory: {}, gpu: {}]".format( - self.request_cpu, self.request_memory, self.limit_cpu, self.limit_memory, self.limit_gpu - ) +from kubernetes.client import models as k8s +from airflow.kubernetes.pod import Port, Resources # noqa +from airflow.kubernetes.volume import Volume +from airflow.kubernetes.volume_mount import VolumeMount +from airflow.kubernetes.secret import Secret -class Port: - def __init__( - self, - name=None, - container_port=None): - self.name = name - self.container_port = container_port +from kubernetes.client.api_client import ApiClient +api_client = ApiClient() -class Pod: + +class Pod(object): """ Represents a kubernetes pod and manages execution of a single pod. + :param image: The docker image :type image: str :param envs: A dict containing the environment variables @@ -122,6 +100,10 @@ def __init__( pod_runtime_info_envs=None, dnspolicy=None ): + warnings.warn( + "Using `airflow.contrib.kubernetes.pod.Pod` is deprecated. 
Please use `k8s.V1Pod`.", + DeprecationWarning, stacklevel=2 + ) self.image = image self.envs = envs or {} self.cmds = cmds @@ -136,7 +118,7 @@ def __init__( self.node_selectors = node_selectors or {} self.namespace = namespace self.image_pull_policy = image_pull_policy - self.image_pull_secrets = image_pull_secrets + self.image_pull_secrets = image_pull_secrets or "" self.init_containers = init_containers self.service_account_name = service_account_name self.resources = resources or Resources() @@ -144,7 +126,182 @@ def __init__( self.affinity = affinity or {} self.hostnetwork = hostnetwork or False self.tolerations = tolerations or [] - self.security_context = security_context + self.security_context = security_context or {} self.configmaps = configmaps or [] self.pod_runtime_info_envs = pod_runtime_info_envs or [] self.dnspolicy = dnspolicy + + def to_v1_kubernetes_pod(self): + """ + Convert to support k8s V1Pod + + :return: k8s.V1Pod + """ + import kubernetes.client.models as k8s + meta = k8s.V1ObjectMeta( + labels=self.labels, + name=self.name, + namespace=self.namespace, + annotations=self.annotations, + ) + if self.image_pull_secrets: + image_pull_secrets = [k8s.V1LocalObjectReference(i) + for i in self.image_pull_secrets.split(",")] + else: + image_pull_secrets = [] + spec = k8s.V1PodSpec( + init_containers=self.init_containers, + containers=[ + k8s.V1Container( + image=self.image, + command=self.cmds, + env_from=[], + name="base", + env=[k8s.V1EnvVar(name=key, value=val) for key, val in self.envs.items()], + args=self.args, + image_pull_policy=self.image_pull_policy, + ) + ], + image_pull_secrets=image_pull_secrets, + service_account_name=self.service_account_name, + node_selector=self.node_selectors, + dns_policy=self.dnspolicy, + host_network=self.hostnetwork, + tolerations=self.tolerations, + affinity=self.affinity, + security_context=self.security_context, + ) + + pod = k8s.V1Pod( + spec=spec, + metadata=meta, + ) + for configmap_name in self.configmaps: + env_var = k8s.V1EnvFromSource( + config_map_ref=k8s.V1ConfigMapEnvSource( + name=configmap_name, + ) + ) + pod.spec.containers[0].env_from.append(env_var) + + for port in _extract_ports(self.ports): + pod = port.attach_to_pod(pod) + volumes = _extract_volumes(self.volumes) + for volume in volumes: + pod = volume.attach_to_pod(pod) + for volume_mount in _extract_volume_mounts(self.volume_mounts): + pod = volume_mount.attach_to_pod(pod) + for secret in self.secrets: + pod = secret.attach_to_pod(pod) + for runtime_info in self.pod_runtime_info_envs: + pod = runtime_info.attach_to_pod(pod) + pod = _extract_resources(self.resources).attach_to_pod(pod) + return pod + + def as_dict(self): + res = self.__dict__ + res['resources'] = res['resources'].as_dict() + res['ports'] = [port.as_dict() for port in res['ports']] + res['volume_mounts'] = [volume_mount.as_dict() for volume_mount in res['volume_mounts']] + res['volumes'] = [volume.as_dict() for volume in res['volumes']] + + return res + + +def _extract_env_vars_and_secrets(env_vars): + """ + Extracts environment variables and Secret objects from V1Pod Environment + """ + result = {} + env_vars = env_vars or [] # type: List[Union[k8s.V1EnvVar, dict]] + secrets = [] + for env_var in env_vars: + if isinstance(env_var, k8s.V1EnvVar): + secret = _extract_env_secret(env_var) + if secret: + secrets.append(secret) + continue + env_var = api_client.sanitize_for_serialization(env_var) + result[env_var.get("name")] = env_var.get("value") + return result, secrets + + +def 
_extract_env_secret(env_var): + if env_var.value_from and env_var.value_from.secret_key_ref: + secret = env_var.value_from.secret_key_ref # type: k8s.V1SecretKeySelector + name = secret.name + key = secret.key + return Secret("env", deploy_target=env_var.name, secret=name, key=key) + return None + + +def _extract_ports(ports): + result = [] + ports = ports or [] # type: List[Union[k8s.V1ContainerPort, dict]] + for port in ports: + if isinstance(port, k8s.V1ContainerPort): + port = api_client.sanitize_for_serialization(port) + port = Port(name=port.get("name"), container_port=port.get("containerPort")) + elif not isinstance(port, Port): + port = Port(name=port.get("name"), container_port=port.get("containerPort")) + result.append(port) + return result + + +def _extract_resources(resources): + if isinstance(resources, k8s.V1ResourceRequirements): + requests = resources.requests or {} + limits = resources.limits or {} + return Resources( + request_memory=requests.get('memory', None), + request_cpu=requests.get('cpu', None), + request_ephemeral_storage=requests.get('ephemeral-storage', None), + limit_memory=limits.get('memory', None), + limit_cpu=limits.get('cpu', None), + limit_ephemeral_storage=limits.get('ephemeral-storage', None), + limit_gpu=limits.get('nvidia.com/gpu') + ) + elif isinstance(resources, Resources): + return resources + + +def _extract_security_context(security_context): + if isinstance(security_context, k8s.V1PodSecurityContext): + security_context = api_client.sanitize_for_serialization(security_context) + return security_context + + +def _extract_volume_mounts(volume_mounts): + result = [] + volume_mounts = volume_mounts or [] # type: List[Union[k8s.V1VolumeMount, dict]] + for volume_mount in volume_mounts: + if isinstance(volume_mount, k8s.V1VolumeMount): + volume_mount = api_client.sanitize_for_serialization(volume_mount) + volume_mount = VolumeMount( + name=volume_mount.get("name"), + mount_path=volume_mount.get("mountPath"), + sub_path=volume_mount.get("subPath"), + read_only=volume_mount.get("readOnly") + ) + elif not isinstance(volume_mount, VolumeMount): + volume_mount = VolumeMount( + name=volume_mount.get("name"), + mount_path=volume_mount.get("mountPath"), + sub_path=volume_mount.get("subPath"), + read_only=volume_mount.get("readOnly") + ) + result.append(volume_mount) + return result + + +def _extract_volumes(volumes): + result = [] + volumes = volumes or [] # type: List[Union[k8s.V1Volume, dict]] + for volume in volumes: + if isinstance(volume, k8s.V1Volume): + volume = api_client.sanitize_for_serialization(volume) + volume = Volume(name=volume.get("name"), configs=volume) + if not isinstance(volume, Volume): + volume = Volume(name=volume.get("name"), configs=volume) + result.append(volume) + return result diff --git a/airflow/contrib/kubernetes/pod_generator.py b/airflow/contrib/kubernetes/pod_generator.py deleted file mode 100644 index e55cc51e8e14a..0000000000000 --- a/airflow/contrib/kubernetes/pod_generator.py +++ /dev/null @@ -1,181 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from airflow.contrib.kubernetes.pod import Pod, Port -from airflow.contrib.kubernetes.volume import Volume -from airflow.contrib.kubernetes.volume_mount import VolumeMount -import uuid - - -class PodGenerator: - """Contains Kubernetes Airflow Worker configuration logic""" - - def __init__(self, kube_config=None): - self.kube_config = kube_config - self.ports = [] - self.volumes = [] - self.volume_mounts = [] - self.init_containers = [] - - def add_init_container(self, - name, - image, - security_context, - init_environment, - volume_mounts - ): - """ - - Adds an init container to the launched pod. useful for pre- - - Args: - name (str): - image (str): - security_context (dict): - init_environment (dict): - volume_mounts (dict): - - Returns: - - """ - self.init_containers.append( - { - 'name': name, - 'image': image, - 'securityContext': security_context, - 'env': init_environment, - 'volumeMounts': volume_mounts - } - ) - - def _get_init_containers(self): - return self.init_containers - - def add_port(self, port): # type: (Port) -> None - """ - Adds a Port to the generator - - :param port: ports for generated pod - :type port: airflow.contrib.kubernetes.pod.Port - """ - self.ports.append({'name': port.name, 'containerPort': port.container_port}) - - def add_volume(self, volume): # type: (Volume) -> None - """ - Adds a Volume to the generator - - :param volume: volume for generated pod - :type volume: airflow.contrib.kubernetes.volume.Volume - """ - - self._add_volume(name=volume.name, configs=volume.configs) - - def _add_volume(self, name, configs): - """ - - Args: - name (str): - configs (dict): Configurations for the volume. - Could be used to define PersistentVolumeClaim, ConfigMap, etc... 
- - Returns: - - """ - volume_map = {'name': name} - for k, v in configs.items(): - volume_map[k] = v - - self.volumes.append(volume_map) - - def add_volume_with_configmap(self, name, config_map): - self.volumes.append( - { - 'name': name, - 'configMap': config_map - } - ) - - def _add_mount(self, - name, - mount_path, - sub_path, - read_only): - """ - - Args: - name (str): - mount_path (str): - sub_path (str): - read_only: - - Returns: - - """ - - self.volume_mounts.append({ - 'name': name, - 'mountPath': mount_path, - 'subPath': sub_path, - 'readOnly': read_only - }) - - def add_mount(self, - volume_mount, # type: VolumeMount - ): - """ - Adds a VolumeMount to the generator - - :param volume_mount: volume for generated pod - :type volume_mount: airflow.contrib.kubernetes.volume_mount.VolumeMount - """ - self._add_mount( - name=volume_mount.name, - mount_path=volume_mount.mount_path, - sub_path=volume_mount.sub_path, - read_only=volume_mount.read_only - ) - - def _get_volumes_and_mounts(self): - return self.volumes, self.volume_mounts - - def _get_image_pull_secrets(self): - """Extracts any image pull secrets for fetching container(s)""" - if not self.kube_config.image_pull_secrets: - return [] - return self.kube_config.image_pull_secrets.split(',') - - def make_pod(self, namespace, image, pod_id, cmds, arguments, labels): - volumes, volume_mounts = self._get_volumes_and_mounts() - worker_init_container_spec = self._get_init_containers() - - return Pod( - namespace=namespace, - name=pod_id + "-" + str(uuid.uuid4())[:8], - image=image, - cmds=cmds, - args=arguments, - labels=labels, - envs={}, - secrets=[], - # service_account_name=self.kube_config.worker_service_account_name, - # image_pull_secrets=self.kube_config.image_pull_secrets, - init_containers=worker_init_container_spec, - ports=self.ports, - volumes=volumes, - volume_mounts=volume_mounts, - resources=None - ) diff --git a/airflow/contrib/kubernetes/pod_runtime_info_env.py b/airflow/contrib/kubernetes/pod_runtime_info_env.py index f52791ed43c91..0dc8aedb3612f 100644 --- a/airflow/contrib/kubernetes/pod_runtime_info_env.py +++ b/airflow/contrib/kubernetes/pod_runtime_info_env.py @@ -14,23 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" -Classes for using Kubernetes Downward API -""" +"""This module is deprecated. Please use `airflow.kubernetes.pod_runtime_info_env`.""" +import warnings -class PodRuntimeInfoEnv: - """Defines Pod runtime information as environment variable""" +# pylint: disable=unused-import +from airflow.kubernetes.pod_runtime_info_env import PodRuntimeInfoEnv # noqa - def __init__(self, name, field_path): - """ - Adds Kubernetes pod runtime information as environment variables such as namespace, pod IP, pod name. - Full list of options can be found in kubernetes documentation. - - :param name: the name of the environment variable - :type: name: str - :param field_path: path to pod runtime info. Ex: metadata.namespace | status.podIP - :type: field_path: str - """ - self.name = name - self.field_path = field_path +warnings.warn( + "This module is deprecated. 
Please use `airflow.kubernetes.pod_runtime_info_env`.", + DeprecationWarning, stacklevel=2 +) diff --git a/airflow/contrib/kubernetes/refresh_config.py b/airflow/contrib/kubernetes/refresh_config.py index b060d258ed19b..f88069ef0ce91 100644 --- a/airflow/contrib/kubernetes/refresh_config.py +++ b/airflow/contrib/kubernetes/refresh_config.py @@ -14,106 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""This module is deprecated. Please use `airflow.kubernetes.refresh_config`.""" -""" -NOTE: this module can be removed once upstream client supports token refresh -see: https://github.com/kubernetes-client/python/issues/741 -""" +import warnings -import calendar -import logging -import os -import time -from datetime import datetime +# pylint: disable=unused-import +from airflow.kubernetes.refresh_config import ( # noqa + RefreshConfiguration, RefreshKubeConfigLoader, load_kube_config +) -import yaml -from kubernetes.client import Configuration -from kubernetes.config.exec_provider import ExecProvider -from kubernetes.config.kube_config import KUBE_CONFIG_DEFAULT_LOCATION, KubeConfigLoader - - -class RefreshKubeConfigLoader(KubeConfigLoader): - """ - Patched KubeConfigLoader, this subclass takes expirationTimestamp into - account and sets api key refresh callback hook in Configuration object - """ - def __init__(self, *args, **kwargs): - KubeConfigLoader.__init__(self, *args, **kwargs) - self.api_key_expire_ts = None - - def _load_from_exec_plugin(self): - """ - We override _load_from_exec_plugin method to also read and store - expiration timestamp for aws-iam-authenticator. It will be later - used for api token refresh. - """ - if 'exec' not in self._user: - return None - try: - status = ExecProvider(self._user['exec']).run() - if 'token' not in status: - logging.error('exec: missing token field in plugin output') - return None - self.token = "Bearer %s" % status['token'] # pylint: disable=W0201 - ts_str = status.get('expirationTimestamp') - if ts_str: - self.api_key_expire_ts = calendar.timegm( - datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z").timetuple(), - ) - return True - except Exception as e: # pylint: disable=W0703 - logging.error(str(e)) - - def refresh_api_key(self, client_configuration): - """ - Refresh API key if expired - """ - if self.api_key_expire_ts and time.time() >= self.api_key_expire_ts: - self.load_and_set(client_configuration) - - def load_and_set(self, client_configuration): - KubeConfigLoader.load_and_set(self, client_configuration) - client_configuration.refresh_api_key = self.refresh_api_key - - -class RefreshConfiguration(Configuration): - """ - Patched Configuration, this subclass taskes api key refresh callback hook - into account - """ - def __init__(self, *args, **kwargs): - Configuration.__init__(self, *args, **kwargs) - self.refresh_api_key = None - - def get_api_key_with_prefix(self, identifier): - if self.refresh_api_key: - self.refresh_api_key(self) # pylint: disable=E1102 - return Configuration.get_api_key_with_prefix(self, identifier) - - -def _get_kube_config_loader_for_yaml_file(filename, **kwargs): - """ - Adapted from the upstream _get_kube_config_loader_for_yaml_file function, changed - KubeConfigLoader to RefreshKubeConfigLoader - """ - with open(filename) as f: - return RefreshKubeConfigLoader( - config_dict=yaml.safe_load(f), - config_base_path=os.path.abspath(os.path.dirname(filename)), - **kwargs) - - -def load_kube_config(client_configuration, 
config_file=None, context=None): - """ - Adapted from the upstream load_kube_config function, changes: - - removed persist_config argument since it's not being used - - remove `client_configuration is None` branch since we always pass - in client configuration - """ - if config_file is None: - config_file = os.path.expanduser(KUBE_CONFIG_DEFAULT_LOCATION) - - loader = _get_kube_config_loader_for_yaml_file( - config_file, active_context=context, config_persister=None) - - loader.load_and_set(client_configuration) +warnings.warn( + "This module is deprecated. Please use `airflow.kubernetes.refresh_config`.", + DeprecationWarning, stacklevel=2 +) diff --git a/airflow/contrib/kubernetes/secret.py b/airflow/contrib/kubernetes/secret.py index fde1ded38d275..ad41d4d0b5079 100644 --- a/airflow/contrib/kubernetes/secret.py +++ b/airflow/contrib/kubernetes/secret.py @@ -14,57 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from airflow.exceptions import AirflowConfigException +"""This module is deprecated. Please use `airflow.kubernetes.secret`.""" +import warnings -class Secret(object): - """Defines Kubernetes Secret Volume""" +# pylint: disable=unused-import +from airflow.kubernetes.secret import Secret # noqa - def __init__(self, deploy_type, deploy_target, secret, key=None): - """ - Initialize a Kubernetes Secret Object. Used to track requested secrets from - the user. - - :param deploy_type: The type of secret deploy in Kubernetes, either `env` or - `volume` - :type deploy_type: str - :param deploy_target: (Optional) The environment variable when - `deploy_type` `env` or file path when `deploy_type` `volume` where - expose secret. If `key` is not provided deploy target should be None. - :type deploy_target: str or None - :param secret: Name of the secrets object in Kubernetes - :type secret: str - :param key: (Optional) Key of the secret within the Kubernetes Secret - if not provided in `deploy_type` `env` it will mount all secrets in object - :type key: str or None - """ - self.deploy_type = deploy_type - self.deploy_target = deploy_target - - if deploy_target is not None and deploy_type == 'env': - # if deploying to env, capitalize the deploy target - self.deploy_target = deploy_target.upper() - - if key is not None and deploy_target is None: - raise AirflowConfigException( - 'If `key` is set, `deploy_target` should not be None' - ) - - self.secret = secret - self.key = key - - def __eq__(self, other): - return ( - self.deploy_type == other.deploy_type and - self.deploy_target == other.deploy_target and - self.secret == other.secret and - self.key == other.key - ) - - def __repr__(self): - return 'Secret({}, {}, {}, {})'.format( - self.deploy_type, - self.deploy_target, - self.secret, - self.key - ) +warnings.warn( + "This module is deprecated. Please use `airflow.kubernetes.secret`.", + DeprecationWarning, stacklevel=2 +) diff --git a/airflow/contrib/kubernetes/volume.py b/airflow/contrib/kubernetes/volume.py index 94003fe48dcb3..c72e20837ddbd 100644 --- a/airflow/contrib/kubernetes/volume.py +++ b/airflow/contrib/kubernetes/volume.py @@ -14,20 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""This module is deprecated. 
Please use `airflow.kubernetes.volume`.""" +import warnings -class Volume: - """Defines Kubernetes Volume""" +# pylint: disable=unused-import +from airflow.kubernetes.volume import Volume # noqa - def __init__(self, name, configs): - """ Adds Kubernetes Volume to pod. allows pod to access features like ConfigMaps - and Persistent Volumes - :param name: the name of the volume mount - :type name: str - :param configs: dictionary of any features needed for volume. - We purposely keep this vague since there are multiple volume types with changing - configs. - :type configs: dict - """ - self.name = name - self.configs = configs +warnings.warn( + "This module is deprecated. Please use `airflow.kubernetes.volume`.", + DeprecationWarning, stacklevel=2 +) diff --git a/airflow/contrib/kubernetes/volume_mount.py b/airflow/contrib/kubernetes/volume_mount.py index 4bdf09c07c0e5..a474e3b7b37ff 100644 --- a/airflow/contrib/kubernetes/volume_mount.py +++ b/airflow/contrib/kubernetes/volume_mount.py @@ -14,24 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""This module is deprecated. Please use `airflow.kubernetes.volume_mount`.""" +import warnings -class VolumeMount: - """Defines Kubernetes Volume Mount""" +# pylint: disable=unused-import +from airflow.kubernetes.volume_mount import VolumeMount # noqa - def __init__(self, name, mount_path, sub_path, read_only): - """Initialize a Kubernetes Volume Mount. Used to mount pod level volumes to - running container. - :param name: the name of the volume mount - :type name: str - :param mount_path: - :type mount_path: str - :param sub_path: subpath within the volume mount - :type sub_path: str - :param read_only: whether to access pod with read-only mode - :type read_only: bool - """ - self.name = name - self.mount_path = mount_path - self.sub_path = sub_path - self.read_only = read_only +warnings.warn( + "This module is deprecated. Please use `airflow.kubernetes.volume_mount`.", + DeprecationWarning, stacklevel=2 +) diff --git a/airflow/contrib/kubernetes/worker_configuration.py b/airflow/contrib/kubernetes/worker_configuration.py deleted file mode 100644 index 9f4c3ae39e9ab..0000000000000 --- a/airflow/contrib/kubernetes/worker_configuration.py +++ /dev/null @@ -1,432 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
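Editor's note: the shim modules above (kube_client, pod, pod_runtime_info_env, refresh_config, secret, volume, volume_mount) keep the old airflow.contrib.kubernetes import paths working while emitting a DeprecationWarning at import time. A hypothetical snippet showing how that warning surfaces, assuming the shim module has not already been imported in the process:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Old path still works, but the shim added above warns on first import.
    from airflow.contrib.kubernetes.volume import Volume  # noqa: F401

assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Preferred import path after this change:
from airflow.kubernetes.volume import Volume  # noqa: F401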
- -import os -import six - -from airflow.configuration import conf -from airflow.contrib.kubernetes.pod import Pod, Resources -from airflow.contrib.kubernetes.secret import Secret -from airflow.utils.log.logging_mixin import LoggingMixin -from airflow.version import version as airflow_version - - -class WorkerConfiguration(LoggingMixin): - """Contains Kubernetes Airflow Worker configuration logic""" - - dags_volume_name = 'airflow-dags' - logs_volume_name = 'airflow-logs' - git_sync_ssh_secret_volume_name = 'git-sync-ssh-key' - git_ssh_key_secret_key = 'gitSshKey' - git_sync_ssh_known_hosts_volume_name = 'git-sync-known-hosts' - git_ssh_known_hosts_configmap_key = 'known_hosts' - - def __init__(self, kube_config): - self.kube_config = kube_config - self.worker_airflow_home = self.kube_config.airflow_home - self.worker_airflow_dags = self.kube_config.dags_folder - self.worker_airflow_logs = self.kube_config.base_log_folder - - super(WorkerConfiguration, self).__init__() - - def _get_init_containers(self): - """When using git to retrieve the DAGs, use the GitSync Init Container""" - # If we're using volume claims to mount the dags, no init container is needed - if self.kube_config.dags_volume_claim or \ - self.kube_config.dags_volume_host or self.kube_config.dags_in_image: - return [] - - # Otherwise, define a git-sync init container - init_environment = [{ - 'name': 'GIT_SYNC_REPO', - 'value': self.kube_config.git_repo - }, { - 'name': 'GIT_SYNC_BRANCH', - 'value': self.kube_config.git_branch - }, { - 'name': 'GIT_SYNC_ROOT', - 'value': self.kube_config.git_sync_root - }, { - 'name': 'GIT_SYNC_DEST', - 'value': self.kube_config.git_sync_dest - }, { - 'name': 'GIT_SYNC_DEPTH', - 'value': '1' - }, { - 'name': 'GIT_SYNC_ONE_TIME', - 'value': 'true' - }, { - 'name': 'GIT_SYNC_REV', - 'value': self.kube_config.git_sync_rev - }] - - for env_var_name, env_var_val in six.iteritems(self.kube_config.kube_env_vars): - init_environment.append({ - 'name': env_var_name, - 'value': env_var_val - }) - - if self.kube_config.git_user: - init_environment.append({ - 'name': 'GIT_SYNC_USERNAME', - 'value': self.kube_config.git_user - }) - if self.kube_config.git_password: - init_environment.append({ - 'name': 'GIT_SYNC_PASSWORD', - 'value': self.kube_config.git_password - }) - - if self.kube_config.git_sync_credentials_secret: - init_environment.extend([ - { - 'name': 'GIT_SYNC_USERNAME', - 'valueFrom': { - 'secretKeyRef': { - 'name': self.kube_config.git_sync_credentials_secret, - 'key': 'GIT_SYNC_USERNAME' - } - } - }, - { - 'name': 'GIT_SYNC_PASSWORD', - 'valueFrom': { - 'secretKeyRef': { - 'name': self.kube_config.git_sync_credentials_secret, - 'key': 'GIT_SYNC_PASSWORD' - } - } - } - ]) - - volume_mounts = [{ - 'mountPath': self.kube_config.git_sync_root, - 'name': self.dags_volume_name, - 'readOnly': False - }] - if self.kube_config.git_ssh_key_secret_name: - volume_mounts.append({ - 'name': self.git_sync_ssh_secret_volume_name, - 'mountPath': '/etc/git-secret/ssh', - 'subPath': 'ssh' - }) - init_environment.extend([ - { - 'name': 'GIT_SSH_KEY_FILE', - 'value': '/etc/git-secret/ssh' - }, - { - 'name': 'GIT_SYNC_SSH', - 'value': 'true' - }]) - if self.kube_config.git_ssh_known_hosts_configmap_name: - volume_mounts.append({ - 'name': self.git_sync_ssh_known_hosts_volume_name, - 'mountPath': '/etc/git-secret/known_hosts', - 'subPath': 'known_hosts' - }) - init_environment.extend([ - { - 'name': 'GIT_KNOWN_HOSTS', - 'value': 'true' - }, - { - 'name': 'GIT_SSH_KNOWN_HOSTS_FILE', - 'value': 
'/etc/git-secret/known_hosts' - } - ]) - else: - init_environment.append({ - 'name': 'GIT_KNOWN_HOSTS', - 'value': 'false' - }) - - init_containers = [{ - 'name': self.kube_config.git_sync_init_container_name, - 'image': self.kube_config.git_sync_container, - 'env': init_environment, - 'volumeMounts': volume_mounts - }] - - if self.kube_config.git_sync_run_as_user != "": - init_containers[0]['securityContext'] = { - 'runAsUser': self.kube_config.git_sync_run_as_user # git-sync user - } - - return init_containers - - def _get_environment(self): - """Defines any necessary environment variables for the pod executor""" - env = {} - - for env_var_name, env_var_val in six.iteritems(self.kube_config.kube_env_vars): - env[env_var_name] = env_var_val - - env["AIRFLOW__CORE__EXECUTOR"] = "LocalExecutor" - - if self.kube_config.airflow_configmap: - env['AIRFLOW_HOME'] = self.worker_airflow_home - env['AIRFLOW__CORE__DAGS_FOLDER'] = self.worker_airflow_dags - if (not self.kube_config.airflow_configmap and - 'AIRFLOW__CORE__SQL_ALCHEMY_CONN' not in self.kube_config.kube_secrets): - env['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = conf.get("core", "SQL_ALCHEMY_CONN") - if self.kube_config.git_dags_folder_mount_point: - # /root/airflow/dags/repo/dags - dag_volume_mount_path = os.path.join( - self.kube_config.git_dags_folder_mount_point, - self.kube_config.git_sync_dest, # repo - self.kube_config.git_subpath # dags - ) - env['AIRFLOW__CORE__DAGS_FOLDER'] = dag_volume_mount_path - return env - - def _get_configmaps(self): - """Extracts any configmapRefs to envFrom""" - if not self.kube_config.env_from_configmap_ref: - return [] - return self.kube_config.env_from_configmap_ref.split(',') - - def _get_secrets(self): - """Defines any necessary secrets for the pod executor""" - worker_secrets = [] - - for env_var_name, obj_key_pair in six.iteritems(self.kube_config.kube_secrets): - k8s_secret_obj, k8s_secret_key = obj_key_pair.split('=') - worker_secrets.append( - Secret('env', env_var_name, k8s_secret_obj, k8s_secret_key) - ) - - if self.kube_config.env_from_secret_ref: - for secret_ref in self.kube_config.env_from_secret_ref.split(','): - worker_secrets.append( - Secret('env', None, secret_ref) - ) - - return worker_secrets - - def _get_image_pull_secrets(self): - """Extracts any image pull secrets for fetching container(s)""" - if not self.kube_config.image_pull_secrets: - return [] - return self.kube_config.image_pull_secrets.split(',') - - def _get_security_context(self): - """Defines the security context""" - security_context = {} - - if self.kube_config.worker_run_as_user != "": - security_context['runAsUser'] = self.kube_config.worker_run_as_user - - if self.kube_config.worker_fs_group != "": - security_context['fsGroup'] = self.kube_config.worker_fs_group - - # set fs_group to 65533 if not explicitly specified and using git ssh keypair auth - if self.kube_config.git_ssh_key_secret_name and security_context.get('fsGroup') is None: - security_context['fsGroup'] = 65533 - - return security_context - - def _get_labels(self, kube_executor_labels, labels): - copy = self.kube_config.kube_labels.copy() - copy.update(kube_executor_labels) - copy.update(labels) - return copy - - def _get_volumes_and_mounts(self): - def _construct_volume(name, claim, host): - volume = { - 'name': name - } - if claim: - volume['persistentVolumeClaim'] = { - 'claimName': claim - } - elif host: - volume['hostPath'] = { - 'path': host, - 'type': '' - } - else: - volume['emptyDir'] = {} - return volume - - volumes = { - 
self.dags_volume_name: _construct_volume( - self.dags_volume_name, - self.kube_config.dags_volume_claim, - self.kube_config.dags_volume_host - ), - self.logs_volume_name: _construct_volume( - self.logs_volume_name, - self.kube_config.logs_volume_claim, - self.kube_config.logs_volume_host - ) - } - - volume_mounts = { - self.dags_volume_name: { - 'name': self.dags_volume_name, - 'mountPath': self.generate_dag_volume_mount_path(), - 'readOnly': True, - }, - self.logs_volume_name: { - 'name': self.logs_volume_name, - 'mountPath': self.worker_airflow_logs, - } - } - - if self.kube_config.dags_volume_subpath: - volume_mounts[self.dags_volume_name]['subPath'] = self.kube_config.dags_volume_subpath - - if self.kube_config.logs_volume_subpath: - volume_mounts[self.logs_volume_name]['subPath'] = self.kube_config.logs_volume_subpath - - if self.kube_config.dags_in_image: - del volumes[self.dags_volume_name] - del volume_mounts[self.dags_volume_name] - - # Get the SSH key from secrets as a volume - if self.kube_config.git_ssh_key_secret_name: - volumes[self.git_sync_ssh_secret_volume_name] = { - 'name': self.git_sync_ssh_secret_volume_name, - 'secret': { - 'secretName': self.kube_config.git_ssh_key_secret_name, - 'items': [{ - 'key': self.git_ssh_key_secret_key, - 'path': 'ssh', - 'mode': 0o440 - }] - } - } - - if self.kube_config.git_ssh_known_hosts_configmap_name: - volumes[self.git_sync_ssh_known_hosts_volume_name] = { - 'name': self.git_sync_ssh_known_hosts_volume_name, - 'configMap': { - 'name': self.kube_config.git_ssh_known_hosts_configmap_name - }, - 'mode': 0o440 - } - - if self.kube_config.airflow_local_settings_configmap: - config_path = '{}/config/airflow_local_settings.py'.format(self.worker_airflow_home) - - if self.kube_config.airflow_local_settings_configmap != self.kube_config.airflow_configmap: - config_volume_name = 'airflow-local-settings' - volumes[config_volume_name] = { - 'name': config_volume_name, - 'configMap': { - 'name': self.kube_config.airflow_local_settings_configmap - } - } - - volume_mounts[config_volume_name] = { - 'name': config_volume_name, - 'mountPath': config_path, - 'subPath': 'airflow_local_settings.py', - 'readOnly': True - } - - else: - volume_mounts['airflow-local-settings'] = { - 'name': 'airflow-config', - 'mountPath': config_path, - 'subPath': 'airflow_local_settings.py', - 'readOnly': True - } - - # Mount the airflow.cfg file via a configmap the user has specified - if self.kube_config.airflow_configmap: - config_volume_name = 'airflow-config' - config_path = '{}/airflow.cfg'.format(self.worker_airflow_home) - volumes[config_volume_name] = { - 'name': config_volume_name, - 'configMap': { - 'name': self.kube_config.airflow_configmap - } - } - volume_mounts[config_volume_name] = { - 'name': config_volume_name, - 'mountPath': config_path, - 'subPath': 'airflow.cfg', - 'readOnly': True - } - - return volumes, volume_mounts - - def generate_dag_volume_mount_path(self): - if self.kube_config.dags_volume_claim or self.kube_config.dags_volume_host: - dag_volume_mount_path = self.worker_airflow_dags - else: - dag_volume_mount_path = self.kube_config.git_dags_folder_mount_point - - return dag_volume_mount_path - - def make_pod(self, namespace, worker_uuid, pod_id, dag_id, task_id, execution_date, - try_number, airflow_command, kube_executor_config): - volumes_dict, volume_mounts_dict = self._get_volumes_and_mounts() - worker_init_container_spec = self._get_init_containers() - resources = Resources( - request_memory=kube_executor_config.request_memory, - 
request_cpu=kube_executor_config.request_cpu, - limit_memory=kube_executor_config.limit_memory, - limit_cpu=kube_executor_config.limit_cpu, - limit_gpu=kube_executor_config.limit_gpu - ) - gcp_sa_key = kube_executor_config.gcp_service_account_key - annotations = dict(kube_executor_config.annotations) or self.kube_config.kube_annotations - if gcp_sa_key: - annotations['iam.cloud.google.com/service-account'] = gcp_sa_key - - volumes = [value for value in volumes_dict.values()] + kube_executor_config.volumes - volume_mounts = [value for value in volume_mounts_dict.values()] + kube_executor_config.volume_mounts - - affinity = kube_executor_config.affinity or self.kube_config.kube_affinity - tolerations = kube_executor_config.tolerations or self.kube_config.kube_tolerations - - return Pod( - namespace=namespace, - name=pod_id, - image=kube_executor_config.image or self.kube_config.kube_image, - image_pull_policy=(kube_executor_config.image_pull_policy or - self.kube_config.kube_image_pull_policy), - cmds=airflow_command, - labels=self._get_labels(kube_executor_config.labels, { - 'airflow-worker': worker_uuid, - 'dag_id': dag_id, - 'task_id': task_id, - 'execution_date': execution_date, - 'try_number': str(try_number), - 'airflow_version': airflow_version.replace('+', '-'), - 'kubernetes_executor': 'True', - }), - envs=self._get_environment(), - secrets=self._get_secrets(), - service_account_name=self.kube_config.worker_service_account_name, - image_pull_secrets=self.kube_config.image_pull_secrets, - init_containers=worker_init_container_spec, - volumes=volumes, - volume_mounts=volume_mounts, - resources=resources, - annotations=annotations, - node_selectors=(kube_executor_config.node_selectors or - self.kube_config.kube_node_selectors), - affinity=affinity, - tolerations=tolerations, - security_context=self._get_security_context(), - configmaps=self._get_configmaps() - ) diff --git a/airflow/contrib/operators/bigquery_check_operator.py b/airflow/contrib/operators/bigquery_check_operator.py index 355929135283e..c34fd52f7083e 100644 --- a/airflow/contrib/operators/bigquery_check_operator.py +++ b/airflow/contrib/operators/bigquery_check_operator.py @@ -18,12 +18,12 @@ # under the License. from airflow.contrib.hooks.bigquery_hook import BigQueryHook -from airflow.operators.check_operator import \ - CheckOperator, ValueCheckOperator, IntervalCheckOperator +from airflow.operators.sql import \ + SQLCheckOperator, SQLValueCheckOperator, SQLIntervalCheckOperator from airflow.utils.decorators import apply_defaults -class BigQueryCheckOperator(CheckOperator): +class BigQueryCheckOperator(SQLCheckOperator): """ Performs checks against BigQuery. The ``BigQueryCheckOperator`` expects a sql query that will return a single row. Each value on that @@ -79,7 +79,7 @@ def get_db_hook(self): use_legacy_sql=self.use_legacy_sql) -class BigQueryValueCheckOperator(ValueCheckOperator): +class BigQueryValueCheckOperator(SQLValueCheckOperator): """ Performs a simple value check using sql code. @@ -111,7 +111,7 @@ def get_db_hook(self): use_legacy_sql=self.use_legacy_sql) -class BigQueryIntervalCheckOperator(IntervalCheckOperator): +class BigQueryIntervalCheckOperator(SQLIntervalCheckOperator): """ Checks that the values of metrics given as SQL expressions are within a certain tolerance of the ones from days_back before. 
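The hunk above only swaps the base classes of the BigQuery check operators from airflow.operators.check_operator to the new airflow.operators.sql module; the constructor arguments visible in this diff are unchanged. A minimal sketch of unaffected user code (the DAG id, table and connection id below are hypothetical, not part of this change):

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator

    # BigQueryCheckOperator still takes sql / bigquery_conn_id / use_legacy_sql;
    # only its parent class (now airflow.operators.sql.SQLCheckOperator) moved.
    with DAG(dag_id='example_bq_check', start_date=datetime(2020, 1, 1), schedule_interval=None) as dag:
        check_table_has_rows = BigQueryCheckOperator(
            task_id='check_table_has_rows',
            sql='SELECT COUNT(*) FROM `my_project.my_dataset.my_table`',  # placeholder table
            use_legacy_sql=False,
            bigquery_conn_id='bigquery_default',
        )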
diff --git a/airflow/contrib/operators/bigquery_get_data.py b/airflow/contrib/operators/bigquery_get_data.py index f5e6e50f066d5..288bf5f01468a 100644 --- a/airflow/contrib/operators/bigquery_get_data.py +++ b/airflow/contrib/operators/bigquery_get_data.py @@ -45,7 +45,7 @@ class BigQueryGetDataOperator(BaseOperator): task_id='get_data_from_bq', dataset_id='test_dataset', table_id='Transaction_partitions', - max_results='100', + max_results=100, selected_fields='DATE', bigquery_conn_id='airflow-service-account' ) @@ -56,7 +56,7 @@ class BigQueryGetDataOperator(BaseOperator): :type table_id: str :param max_results: The maximum number of records (rows) to be fetched from the table. (templated) - :type max_results: str + :type max_results: int :param selected_fields: List of fields to return (comma-separated). If unspecified, all fields are returned. :type selected_fields: str @@ -74,7 +74,7 @@ class BigQueryGetDataOperator(BaseOperator): def __init__(self, dataset_id, table_id, - max_results='100', + max_results=100, selected_fields=None, bigquery_conn_id='bigquery_default', delegate_to=None, diff --git a/airflow/contrib/operators/dataproc_operator.py b/airflow/contrib/operators/dataproc_operator.py index a5e126b2288cc..ff07be59b79e8 100644 --- a/airflow/contrib/operators/dataproc_operator.py +++ b/airflow/contrib/operators/dataproc_operator.py @@ -1057,7 +1057,7 @@ class DataProcPySparkOperator(DataProcJobBaseOperator): Start a PySpark Job on a Cloud DataProc cluster. :param main: [Required] The Hadoop Compatible Filesystem (HCFS) URI of the main - Python file to use as the driver. Must be a .py file. + Python file to use as the driver. Must be a .py file. (templated) :type main: str :param arguments: Arguments for the job. (templated) :type arguments: list @@ -1077,7 +1077,7 @@ class DataProcPySparkOperator(DataProcJobBaseOperator): :type dataproc_pyspark_jars: list """ - template_fields = ['arguments', 'job_name', 'cluster_name', + template_fields = ['main', 'arguments', 'job_name', 'cluster_name', 'region', 'dataproc_jars', 'dataproc_properties'] ui_color = '#0273d4' job_type = 'pysparkJob' diff --git a/airflow/contrib/operators/emr_add_steps_operator.py b/airflow/contrib/operators/emr_add_steps_operator.py index 0075b1b7bf14a..1917752d06592 100644 --- a/airflow/contrib/operators/emr_add_steps_operator.py +++ b/airflow/contrib/operators/emr_add_steps_operator.py @@ -66,12 +66,14 @@ def __init__( self.steps = steps def execute(self, context): - emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn() + emr_hook = EmrHook(aws_conn_id=self.aws_conn_id) - job_flow_id = self.job_flow_id + emr = emr_hook.get_conn() + job_flow_id = self.job_flow_id or emr_hook.get_cluster_id_by_name(self.job_flow_name, + self.cluster_states) if not job_flow_id: - job_flow_id = emr.get_cluster_id_by_name(self.job_flow_name, self.cluster_states) + raise AirflowException('No cluster found for name: ' + self.job_flow_name) if self.do_xcom_push: context['ti'].xcom_push(key='job_flow_id', value=job_flow_id) diff --git a/airflow/contrib/operators/file_to_wasb.py b/airflow/contrib/operators/file_to_wasb.py index 27760af9fc78a..8cb9fca6fe29a 100644 --- a/airflow/contrib/operators/file_to_wasb.py +++ b/airflow/contrib/operators/file_to_wasb.py @@ -57,8 +57,7 @@ def execute(self, context): """Upload a file to Azure Blob Storage.""" hook = WasbHook(wasb_conn_id=self.wasb_conn_id) self.log.info( - 'Uploading %s to wasb://%s ' - 'as %s'.format(self.file_path, self.container_name, self.blob_name) - ) + 'Uploading %s to 
wasb://%s ' # noqa: F523 + 'as %s', self.file_path, self.container_name, self.blob_name) hook.load_file(self.file_path, self.container_name, self.blob_name, **self.load_options) diff --git a/airflow/contrib/operators/gcp_sql_operator.py b/airflow/contrib/operators/gcp_sql_operator.py index 4e0c279baf37c..9a9384e833cc2 100644 --- a/airflow/contrib/operators/gcp_sql_operator.py +++ b/airflow/contrib/operators/gcp_sql_operator.py @@ -683,7 +683,7 @@ class CloudSqlInstanceImportOperator(CloudSqlBaseOperator): :type validate_body: bool """ # [START gcp_sql_import_template_fields] - template_fields = ('project_id', 'instance', 'gcp_conn_id', 'api_version') + template_fields = ('project_id', 'instance', 'gcp_conn_id', 'api_version', 'body') # [END gcp_sql_import_template_fields] @apply_defaults diff --git a/airflow/contrib/operators/kubernetes_pod_operator.py b/airflow/contrib/operators/kubernetes_pod_operator.py index f692599d76bb5..dcd6c3e0ed2ea 100644 --- a/airflow/contrib/operators/kubernetes_pod_operator.py +++ b/airflow/contrib/operators/kubernetes_pod_operator.py @@ -15,28 +15,38 @@ # specific language governing permissions and limitations # under the License. """Executes task in a Kubernetes POD""" + import re -import warnings +import yaml from airflow.exceptions import AirflowException +from airflow.kubernetes.k8s_model import append_to_pod +from airflow.kubernetes import kube_client, pod_generator, pod_launcher +from airflow.kubernetes.pod import Resources from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults -from airflow.contrib.kubernetes import kube_client, pod_generator, pod_launcher -from airflow.contrib.kubernetes.pod import Resources from airflow.utils.helpers import validate_key from airflow.utils.state import State from airflow.version import version as airflow_version +from airflow.kubernetes.pod_generator import PodGenerator +from kubernetes.client import models as k8s class KubernetesPodOperator(BaseOperator): # pylint: disable=too-many-instance-attributes """ Execute a task in a Kubernetes Pod + .. note:: + If you use `Google Kubernetes Engine `__, use + :class:`~airflow.gcp.operators.kubernetes_engine.GKEPodOperator`, which + simplifies the authorization process. + :param image: Docker image you wish to launch. Defaults to hub.docker.com, but fully qualified URLS will point to custom repositories. :type image: str - :param namespace: the namespace to run within kubernetes. - :type namespace: str + :param name: name of the pod in which the task will run, will be used (plus a random + suffix) to generate a pod id (DNS-1123 subdomain, containing only [a-z0-9.-]). + :type name: str :param cmds: entrypoint of the container. (templated) The docker images's entrypoint is used if this is not provided. :type cmds: list[str] @@ -50,11 +60,11 @@ class KubernetesPodOperator(BaseOperator): # pylint: disable=too-many-instance- comma separated list: secret_a,secret_b :type image_pull_secrets: str :param ports: ports for launched pod. - :type ports: list[airflow.contrib.kubernetes.pod.Port] + :type ports: list[airflow.kubernetes.pod.Port] :param volume_mounts: volumeMounts for launched pod. - :type volume_mounts: list[airflow.contrib.kubernetes.volume_mount.VolumeMount] + :type volume_mounts: list[airflow.kubernetes.volume_mount.VolumeMount] :param volumes: volumes for launched pod. Includes ConfigMaps and PersistentVolumes. 
- :type volumes: list[airflow.contrib.kubernetes.volume.Volume] + :type volumes: list[airflow.kubernetes.volume.Volume] :param labels: labels to apply to the Pod. :type labels: dict :param startup_timeout_seconds: timeout in seconds to startup the pod. @@ -65,13 +75,19 @@ class KubernetesPodOperator(BaseOperator): # pylint: disable=too-many-instance- :param env_vars: Environment variables initialized in the container. (templated) :type env_vars: dict :param secrets: Kubernetes secrets to inject in the container. - They can be exposed as environment vars or files in a volume - :type secrets: list[airflow.contrib.kubernetes.secret.Secret] + They can be exposed as environment vars or files in a volume. + :type secrets: list[airflow.kubernetes.secret.Secret] :param in_cluster: run kubernetes client with in_cluster configuration. :type in_cluster: bool :param cluster_context: context that points to kubernetes cluster. Ignored when in_cluster is True. If None, current-context is used. :type cluster_context: str + :param reattach_on_restart: if the scheduler dies while the pod is running, reattach and monitor + :type reattach_on_restart: bool + :param labels: labels to apply to the Pod. + :type labels: dict + :param startup_timeout_seconds: timeout in seconds to startup the pod. + :type startup_timeout_seconds: int :param get_logs: get the stdout of the container as logs of the tasks. :type get_logs: bool :param annotations: non-identifying metadata you can attach to the Pod. @@ -88,6 +104,8 @@ class KubernetesPodOperator(BaseOperator): # pylint: disable=too-many-instance- :param node_selectors: A dict containing a group of scheduling rules. :type node_selectors: dict :param config_file: The path to the Kubernetes config file. (templated) + :param config_file: The path to the Kubernetes config file. (templated) + If not specified, default value is ``~/.kube/config`` :type config_file: str :param do_xcom_push: If do_xcom_push is True, the content of the file /airflow/xcom/return.json in the container will also be pushed to an @@ -106,19 +124,38 @@ class KubernetesPodOperator(BaseOperator): # pylint: disable=too-many-instance- :type configmaps: list[str] :param pod_runtime_info_envs: environment variables about pod runtime information (ip, namespace, nodeName, podName). - :type pod_runtime_info_envs: list[PodRuntimeEnv] + :type pod_runtime_info_envs: list[airflow.kubernetes.pod_runtime_info_env.PodRuntimeInfoEnv] :param security_context: security options the pod should run with (PodSecurityContext). :type security_context: dict :param dnspolicy: dnspolicy for the pod. :type dnspolicy: str + :param schedulername: Specify a schedulername for the pod + :type schedulername: str + :param full_pod_spec: The complete podSpec + :type full_pod_spec: kubernetes.client.models.V1Pod + :param init_containers: init container for the launched Pod + :type init_containers: list[kubernetes.client.models.V1Container] + :param log_events_on_failure: Log the pod's events if a failure occurs + :type log_events_on_failure: bool + :param do_xcom_push: If True, the content of the file + /airflow/xcom/return.json in the container will also be pushed to an + XCom when the container completes. 
+ :type do_xcom_push: bool + :param pod_template_file: path to pod template file + :type pod_template_file: str + :param priority_class_name: priority class name for the launched Pod + :type priority_class_name: str + :param termination_grace_period: Termination grace period if task killed in UI, + defaults to kubernetes default + :type termination_grace_period: int """ - template_fields = ('cmds', 'arguments', 'env_vars', 'config_file') + template_fields = ('image', 'cmds', 'arguments', 'env_vars', 'config_file', 'pod_template_file') @apply_defaults def __init__(self, # pylint: disable=too-many-arguments,too-many-locals - namespace, - image, - name, + namespace=None, + image=None, + name=None, cmds=None, arguments=None, ports=None, @@ -129,6 +166,7 @@ def __init__(self, # pylint: disable=too-many-arguments,too-many-locals in_cluster=None, cluster_context=None, labels=None, + reattach_on_restart=True, startup_timeout_seconds=120, get_logs=True, image_pull_policy='IfNotPresent', @@ -146,17 +184,21 @@ def __init__(self, # pylint: disable=too-many-arguments,too-many-locals security_context=None, pod_runtime_info_envs=None, dnspolicy=None, + schedulername=None, + full_pod_spec=None, + init_containers=None, + log_events_on_failure=False, do_xcom_push=False, + pod_template_file=None, + priority_class_name=None, + termination_grace_period=None, *args, **kwargs): - # https://github.com/apache/airflow/blob/2d0eff4ee4fafcf8c7978ac287a8fb968e56605f/UPDATING.md#unification-of-do_xcom_push-flag if kwargs.get('xcom_push') is not None: - kwargs['do_xcom_push'] = kwargs.pop('xcom_push') - warnings.warn( - "`xcom_push` will be deprecated. Use `do_xcom_push` instead.", - DeprecationWarning, stacklevel=2 - ) + raise AirflowException("'xcom_push' was deprecated, use 'do_xcom_push' instead") super(KubernetesPodOperator, self).__init__(*args, resources=None, **kwargs) + + self.pod = None self.do_xcom_push = do_xcom_push self.image = image self.namespace = namespace @@ -164,7 +206,6 @@ def __init__(self, # pylint: disable=too-many-arguments,too-many-locals self.arguments = arguments or [] self.labels = labels or {} self.startup_timeout_seconds = startup_timeout_seconds - self.name = self._set_name(name) self.env_vars = env_vars or {} self.ports = ports or [] self.volume_mounts = volume_mounts or [] @@ -172,14 +213,16 @@ def __init__(self, # pylint: disable=too-many-arguments,too-many-locals self.secrets = secrets or [] self.in_cluster = in_cluster self.cluster_context = cluster_context + self.reattach_on_restart = reattach_on_restart self.get_logs = get_logs self.image_pull_policy = image_pull_policy self.node_selectors = node_selectors or {} self.annotations = annotations or {} self.affinity = affinity or {} - self.resources = self._set_resources(resources) + self.resources = self._set_resources(resources) # noqa + self.k8s_resources = self.resources self.config_file = config_file - self.image_pull_secrets = image_pull_secrets + self.image_pull_secrets = image_pull_secrets or [] self.service_account_name = service_account_name self.is_delete_operator_pod = is_delete_operator_pod self.hostnetwork = hostnetwork @@ -188,6 +231,39 @@ def __init__(self, # pylint: disable=too-many-arguments,too-many-locals self.security_context = security_context or {} self.pod_runtime_info_envs = pod_runtime_info_envs or [] self.dnspolicy = dnspolicy + self.schedulername = schedulername + self.full_pod_spec = full_pod_spec + self.init_containers = init_containers or [] + self.log_events_on_failure = log_events_on_failure + 
self.pod_template_file = pod_template_file + self.priority_class_name = priority_class_name + self.name = self._set_name(name) + self.termination_grace_period = termination_grace_period + self.client = None + + @staticmethod + def create_labels_for_pod(context): + """ + Generate labels for the pod to track the pod in case of Operator crash + + :param context: task context provided by airflow DAG + :return: dict + """ + labels = { + 'dag_id': context['dag'].dag_id, + 'task_id': context['task'].task_id, + 'execution_date': context['ts'], + 'try_number': context['ti'].try_number, + } + # In the case of sub dags this is just useful for tracking the parent dag + if context['dag'].is_subdag: + labels['parent_dag_id'] = context['dag'].parent_dag.dag_id + # Ensure that label is valid for Kube, + # and if not truncate/remove invalid chars and replace with short hash. + for label_id, label in labels.items(): + safe_label = pod_generator.make_safe_label_value(str(label)) + labels[label_id] = safe_label + return labels def execute(self, context): try: @@ -199,6 +275,161 @@ def execute(self, context): client = kube_client.get_kube_client(cluster_context=self.cluster_context, config_file=self.config_file) + self.pod = self.create_pod_request_obj() + self.namespace = self.pod.metadata.namespace + + self.client = client + + # Add combination of labels to uniquely identify a running pod + labels = self.create_labels_for_pod(context) + + self.pod = self.create_pod_request_obj() + self.namespace = self.pod.metadata.namespace + + label_selector = self._get_pod_identifying_label_string(labels) + + pod_list = client.list_namespaced_pod(self.namespace, label_selector=label_selector) + + if len(pod_list.items) > 1 and self.reattach_on_restart: + raise AirflowException( + 'More than one pod running with labels: ' + '{label_selector}'.format(label_selector=label_selector)) + + launcher = pod_launcher.PodLauncher(kube_client=client, extract_xcom=self.do_xcom_push) + + if len(pod_list.items) == 1: + try_numbers_match = self._try_numbers_match(context, pod_list.items[0]) + final_state, result = self.handle_pod_overlap( + labels, try_numbers_match, launcher, pod_list.items[0] + ) + else: + final_state, _, result = self.create_new_pod_for_operator(labels, launcher) + if final_state != State.SUCCESS: + raise AirflowException( + 'Pod returned a failure: {state}'.format(state=final_state)) + return result + except AirflowException as ex: + raise AirflowException('Pod Launching failed: {error}'.format(error=ex)) + + def handle_pod_overlap(self, labels, try_numbers_match, launcher, pod): + """ + In cases where the Scheduler restarts while a KubernetesPodOperator task is running, + this function will either continue to monitor the existing pod or launch a new pod + based on the `reattach_on_restart` parameter. + :param labels: labels used to determine if a pod is repeated + :type labels: dict + :param try_numbers_match: do the try numbers match? Only needed for logging purposes + :type try_numbers_match: bool + :param launcher: PodLauncher + :param pod: Pod found + """ + if try_numbers_match: + log_line = "found a running pod with labels {} and the same try_number.".format(labels) + else: + log_line = "found a running pod with labels {} but a different try_number.".format(labels) + + # In case of failed pods, should reattach the first time, but only once + # as the task will have already failed.
+ if self.reattach_on_restart and not pod.metadata.labels.get("already_checked"): + log_line = log_line + " Will attach to this pod and monitor instead of starting new one" + self.log.info(log_line) + self.pod = pod + final_state, result = self.monitor_launched_pod(launcher, pod) + else: + log_line = log_line + "creating pod with labels {} and launcher {}".format(labels, launcher) + self.log.info(log_line) + final_state, _, result = self.create_new_pod_for_operator(labels, launcher) + return final_state, result + + @staticmethod + def _get_pod_identifying_label_string(labels): + filtered_labels = {label_id: label for label_id, label in labels.items() if label_id != 'try_number'} + return ','.join([label_id + '=' + label for label_id, label in sorted(filtered_labels.items())]) + + @staticmethod + def _try_numbers_match(context, pod): + return pod.metadata.labels['try_number'] == context['ti'].try_number + + @staticmethod + def _set_resources(resources): + if not resources: + return [] + return [Resources(**resources)] + + def _set_name(self, name): + if self.pod_template_file or self.full_pod_spec: + return None + validate_key(name, max_length=220) + return re.sub(r'[^a-z0-9.-]+', '-', name.lower()) + + def create_pod_request_obj(self): + """ + Creates a V1Pod based on user parameters. Note that a `pod` or `pod_template_file` + will supersede all other values. + """ + if self.pod_template_file: + pod_template = pod_generator.PodGenerator.deserialize_model_file(self.pod_template_file) + else: + pod_template = k8s.V1Pod(metadata=k8s.V1ObjectMeta(name="name")) + + pod = pod_generator.PodGenerator( + image=self.image, + namespace=self.namespace, + cmds=self.cmds, + args=self.arguments, + labels=self.labels, + name=self.name, + envs=self.env_vars, + extract_xcom=self.do_xcom_push, + image_pull_policy=self.image_pull_policy, + node_selectors=self.node_selectors, + annotations=self.annotations, + affinity=self.affinity, + image_pull_secrets=self.image_pull_secrets, + service_account_name=self.service_account_name, + hostnetwork=self.hostnetwork, + tolerations=self.tolerations, + security_context=self.security_context, + dnspolicy=self.dnspolicy, + init_containers=self.init_containers, + restart_policy='Never', + schedulername=self.schedulername, + priority_class_name=self.priority_class_name, + ).gen_pod() + + # noinspection PyTypeChecker + pod = append_to_pod( + pod, + self.pod_runtime_info_envs + # type: ignore + self.ports + # type: ignore + self.resources + # type: ignore + self.secrets + # type: ignore + self.volumes + # type: ignore + self.volume_mounts # type: ignore + ) + + env_from = pod.spec.containers[0].env_from or [] + for configmap in self.configmaps: + env_from.append(k8s.V1EnvFromSource(config_map_ref=k8s.V1ConfigMapEnvSource(name=configmap))) + pod.spec.containers[0].env_from = env_from + + if self.full_pod_spec: + pod_template = PodGenerator.reconcile_pods(pod_template, self.full_pod_spec) + pod = PodGenerator.reconcile_pods(pod_template, pod) + + # if self.do_xcom_push: + # pod = PodGenerator.add_sidecar(pod) + return pod + + def create_new_pod_for_operator(self, labels, launcher): + """ + Creates a new pod and monitors for duration of task + + @param labels: labels used to track pod + @param launcher: pod launcher that will manage launching and monitoring pods + @return: + """ + if not (self.full_pod_spec or self.pod_template_file): # Add Airflow Version to the label # And a label to identify that pod is launched by KubernetesPodOperator self.labels.update( @@ -207,64 +438,62 
@@ def execute(self, context): 'kubernetes_pod_operator': 'True', } ) + self.labels.update(labels) + self.pod.metadata.labels = self.labels + self.log.debug("Starting pod:\n%s", yaml.safe_dump(self.pod.to_dict())) - gen = pod_generator.PodGenerator() - - for port in self.ports: - gen.add_port(port) - for mount in self.volume_mounts: - gen.add_mount(mount) - for volume in self.volumes: - gen.add_volume(volume) - - pod = gen.make_pod( - namespace=self.namespace, - image=self.image, - pod_id=self.name, - cmds=self.cmds, - arguments=self.arguments, - labels=self.labels, - ) - - pod.service_account_name = self.service_account_name - pod.secrets = self.secrets - pod.envs = self.env_vars - pod.image_pull_policy = self.image_pull_policy - pod.image_pull_secrets = self.image_pull_secrets - pod.annotations = self.annotations - pod.resources = self.resources - pod.affinity = self.affinity - pod.node_selectors = self.node_selectors - pod.hostnetwork = self.hostnetwork - pod.tolerations = self.tolerations - pod.configmaps = self.configmaps - pod.security_context = self.security_context - pod.pod_runtime_info_envs = self.pod_runtime_info_envs - pod.dnspolicy = self.dnspolicy - - launcher = pod_launcher.PodLauncher(kube_client=client, - extract_xcom=self.do_xcom_push) - try: - (final_state, result) = launcher.run_pod( - pod, - startup_timeout=self.startup_timeout_seconds, - get_logs=self.get_logs) - finally: - if self.is_delete_operator_pod: - launcher.delete_pod(pod) - - if final_state != State.SUCCESS: - raise AirflowException( - 'Pod returned a failure: {state}'.format(state=final_state) - ) - if self.do_xcom_push: - return result + try: + launcher.start_pod( + self.pod, + startup_timeout=self.startup_timeout_seconds) + final_state, result = launcher.monitor_pod(pod=self.pod, get_logs=self.get_logs) except AirflowException as ex: + if self.log_events_on_failure: + for event in launcher.read_pod_events(self.pod).items: + self.log.error("Pod Event: %s - %s", event.reason, event.message) raise AirflowException('Pod Launching failed: {error}'.format(error=ex)) + finally: + if self.is_delete_operator_pod: + launcher.delete_pod(self.pod) + return final_state, self.pod, result - def _set_resources(self, resources): - return Resources(**resources) if resources else Resources() + def patch_already_checked(self, pod): + """ + Add an "already tried annotation to ensure we only retry once + """ + pod.metadata.labels["already_checked"] = "True" + body = PodGenerator.serialize_pod(pod) + self.client.patch_namespaced_pod(pod.metadata.name, pod.metadata.namespace, body) - def _set_name(self, name): - validate_key(name, max_length=63) - return re.sub(r'[^a-z0-9.-]+', '-', name.lower()) + def monitor_launched_pod(self, launcher, pod): + """ + Monitors a pod to completion that was created by a previous KubernetesPodOperator + + :param launcher: pod launcher that will manage launching and monitoring pods + :param pod: podspec used to find pod using k8s API + :return: + """ + try: + (final_state, result) = launcher.monitor_pod(pod, get_logs=self.get_logs) + finally: + if self.is_delete_operator_pod: + launcher.delete_pod(pod) + if final_state != State.SUCCESS: + if self.log_events_on_failure: + for event in launcher.read_pod_events(pod).items: + self.log.error("Pod Event: %s - %s", event.reason, event.message) + self.patch_already_checked(self.pod) + raise AirflowException( + 'Pod returned a failure: {state}'.format(state=final_state) + ) + return final_state, result + + def on_kill(self): + if self.pod: + pod = self.pod 
+ namespace = pod.metadata.namespace + name = pod.metadata.name + kwargs = {} + if self.termination_grace_period is not None: + kwargs = {"grace_period_seconds": self.termination_grace_period} + self.client.delete_namespaced_pod(name=name, namespace=namespace, **kwargs) diff --git a/airflow/contrib/operators/mlengine_operator.py b/airflow/contrib/operators/mlengine_operator.py index d3f36ef13bf38..687cff3ea7854 100644 --- a/airflow/contrib/operators/mlengine_operator.py +++ b/airflow/contrib/operators/mlengine_operator.py @@ -15,7 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +import logging import re from googleapiclient.errors import HttpError @@ -24,9 +24,8 @@ from airflow.exceptions import AirflowException from airflow.operators import BaseOperator from airflow.utils.decorators import apply_defaults -from airflow.utils.log.logging_mixin import LoggingMixin -log = LoggingMixin().log +log = logging.getLogger(__name__) def _normalize_mlengine_job_id(job_id): diff --git a/airflow/contrib/operators/qubole_check_operator.py b/airflow/contrib/operators/qubole_check_operator.py index 0286763198f15..2e8bfa4063ec3 100644 --- a/airflow/contrib/operators/qubole_check_operator.py +++ b/airflow/contrib/operators/qubole_check_operator.py @@ -223,6 +223,6 @@ def handle_airflow_exception(airflow_exception, hook): '\nQubole Command Results:' \ '\n{qubole_command_results}'.format( qubole_command_id=qubole_command_id, # noqa: E122 - qubole_command_results=qubole_command_results) + qubole_command_results=qubole_command_results) # noqa: E122 raise AirflowException(str(airflow_exception) + exception_message) raise AirflowException(str(airflow_exception)) diff --git a/airflow/contrib/operators/slack_webhook_operator.py b/airflow/contrib/operators/slack_webhook_operator.py index 6169524fa43c4..3b1bcfaa18399 100644 --- a/airflow/contrib/operators/slack_webhook_operator.py +++ b/airflow/contrib/operators/slack_webhook_operator.py @@ -57,10 +57,12 @@ class SlackWebhookOperator(SimpleHttpOperator): :type link_names: bool :param proxy: Proxy to use to make the Slack webhook call :type proxy: str + :param extra_options: Extra options for http hook + :type extra_options: dict """ template_fields = ['webhook_token', 'message', 'attachments', 'blocks', 'channel', - 'username', 'proxy', ] + 'username', 'proxy', 'extra_options', ] @apply_defaults def __init__(self, @@ -74,6 +76,7 @@ def __init__(self, icon_emoji=None, icon_url=None, link_names=False, + extra_options=None, proxy=None, *args, **kwargs): @@ -92,6 +95,7 @@ def __init__(self, self.link_names = link_names self.proxy = proxy self.hook = None + self.extra_options = extra_options def execute(self, context): """ @@ -108,6 +112,7 @@ def execute(self, context): self.icon_emoji, self.icon_url, self.link_names, - self.proxy + self.proxy, + self.extra_options ) self.hook.execute() diff --git a/airflow/contrib/operators/snowflake_operator.py b/airflow/contrib/operators/snowflake_operator.py index f115fc3701d17..caea8190ae461 100644 --- a/airflow/contrib/operators/snowflake_operator.py +++ b/airflow/contrib/operators/snowflake_operator.py @@ -42,6 +42,14 @@ class SnowflakeOperator(BaseOperator): :type schema: str :param role: name of role (will overwrite any role defined in connection's extra JSON) + :param authenticator: authenticator for Snowflake. 
+ 'snowflake' (default) to use the internal Snowflake authenticator + 'externalbrowser' to authenticate using your web browser and + Okta, ADFS or any other SAML 2.0-compliant identity provider + (IdP) that has been defined for your account + 'https://.okta.com' to authenticate + through native Okta. + :type authenticator: str """ template_fields = ('sql',) @@ -52,7 +60,7 @@ class SnowflakeOperator(BaseOperator): def __init__( self, sql, snowflake_conn_id='snowflake_default', parameters=None, autocommit=True, warehouse=None, database=None, role=None, - schema=None, *args, **kwargs): + schema=None, authenticator=None, *args, **kwargs): super(SnowflakeOperator, self).__init__(*args, **kwargs) self.snowflake_conn_id = snowflake_conn_id self.sql = sql @@ -62,11 +70,12 @@ def __init__( self.database = database self.role = role self.schema = schema + self.authenticator = authenticator def get_hook(self): return SnowflakeHook(snowflake_conn_id=self.snowflake_conn_id, warehouse=self.warehouse, database=self.database, - role=self.role, schema=self.schema) + role=self.role, schema=self.schema, authenticator=self.authenticator) def execute(self, context): self.log.info('Executing: %s', self.sql) diff --git a/airflow/contrib/operators/spark_submit_operator.py b/airflow/contrib/operators/spark_submit_operator.py index 21f6b4ec3df06..caee335976224 100644 --- a/airflow/contrib/operators/spark_submit_operator.py +++ b/airflow/contrib/operators/spark_submit_operator.py @@ -121,7 +121,7 @@ def __init__(self, application_args=None, env_vars=None, verbose=False, - spark_binary="spark-submit", + spark_binary=None, *args, **kwargs): super(SparkSubmitOperator, self).__init__(*args, **kwargs) diff --git a/airflow/contrib/operators/ssh_operator.py b/airflow/contrib/operators/ssh_operator.py index 445f7b4d8526f..1322b4f04e910 100644 --- a/airflow/contrib/operators/ssh_operator.py +++ b/airflow/contrib/operators/ssh_operator.py @@ -43,7 +43,7 @@ class SSHOperator(BaseOperator): :type remote_host: str :param command: command to execute on remote host. (templated) :type command: str - :param timeout: timeout (in seconds) for executing the command. + :param timeout: timeout (in seconds) for executing the command. The default is 10 seconds. :type timeout: int :param environment: a dict of shell environment variables. Note that the server will reject them silently if `AcceptEnv` is not set in SSH config.
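A short sketch of the ``authenticator`` argument added to SnowflakeOperator above; it is simply forwarded to SnowflakeHook, and omitting it keeps the internal Snowflake authenticator. The DAG id, connection id and SQL below are placeholders, not taken from this change:

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.snowflake_operator import SnowflakeOperator

    with DAG(dag_id='example_snowflake_sso', start_date=datetime(2020, 1, 1), schedule_interval=None) as dag:
        run_query = SnowflakeOperator(
            task_id='run_query',
            snowflake_conn_id='snowflake_default',
            sql='SELECT CURRENT_VERSION()',
            # 'externalbrowser' switches authentication to browser/SAML based SSO,
            # as described in the new docstring entry.
            authenticator='externalbrowser',
        )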
diff --git a/airflow/contrib/plugins/metastore_browser/main.py b/airflow/contrib/plugins/metastore_browser/main.py index b0643ac9721f9..d5a68a2fc2f69 100644 --- a/airflow/contrib/plugins/metastore_browser/main.py +++ b/airflow/contrib/plugins/metastore_browser/main.py @@ -20,7 +20,7 @@ from datetime import datetime import json -from flask import Blueprint, request +from flask import Blueprint, Markup, request from flask_admin import BaseView, expose import pandas as pd @@ -35,8 +35,8 @@ PRESTO_CONN_ID = 'presto_default' HIVE_CLI_CONN_ID = 'hive_default' DEFAULT_DB = 'default' -DB_WHITELIST = None -DB_BLACKLIST = ['tmp'] +DB_ALLOW_LIST = None +DB_DENY_LIST = ['tmp'] TABLE_SELECTOR_LIMIT = 2000 # Keeping pandas from truncating long strings @@ -67,7 +67,7 @@ def index(self): escape=False, na_rep='',) return self.render( - "metastore_browser/dbs.html", table=table) + "metastore_browser/dbs.html", table=Markup(table)) @expose('/table/') def table(self): @@ -118,11 +118,11 @@ def partitions(self): @expose('/objects/') def objects(self): where_clause = '' - if DB_WHITELIST: - dbs = ",".join(["'" + db + "'" for db in DB_WHITELIST]) + if DB_ALLOW_LIST: + dbs = ",".join(["'" + db + "'" for db in DB_ALLOW_LIST]) where_clause = "AND b.name IN ({})".format(dbs) - if DB_BLACKLIST: - dbs = ",".join(["'" + db + "'" for db in DB_BLACKLIST]) + if DB_DENY_LIST: + dbs = ",".join(["'" + db + "'" for db in DB_DENY_LIST]) where_clause = "AND b.name NOT IN ({})".format(dbs) sql = """ SELECT CONCAT(b.NAME, '.', a.TBL_NAME), TBL_TYPE diff --git a/airflow/contrib/plugins/metastore_browser/templates/metastore_browser/dbs.html b/airflow/contrib/plugins/metastore_browser/templates/metastore_browser/dbs.html index 9555a023e9452..6a6e187a1af86 100644 --- a/airflow/contrib/plugins/metastore_browser/templates/metastore_browser/dbs.html +++ b/airflow/contrib/plugins/metastore_browser/templates/metastore_browser/dbs.html @@ -23,5 +23,5 @@

<h2>Hive Databases</h2>

- {{ table|safe }} + {{ table }} {% endblock %} diff --git a/airflow/contrib/secrets/aws_secrets_manager.py b/airflow/contrib/secrets/aws_secrets_manager.py new file mode 100644 index 0000000000000..4df9bdbf87ef6 --- /dev/null +++ b/airflow/contrib/secrets/aws_secrets_manager.py @@ -0,0 +1,145 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Objects relating to sourcing secrets from AWS Secrets Manager +""" + +from typing import Optional + +import boto3 +from cached_property import cached_property + +from airflow.secrets import BaseSecretsBackend +from airflow.utils.log.logging_mixin import LoggingMixin + + +class SecretsManagerBackend(BaseSecretsBackend, LoggingMixin): + """ + Retrieves Connection or Variables from AWS Secrets Manager + + Configurable via ``airflow.cfg`` like so: + + .. code-block:: ini + + [secrets] + backend = airflow.contrib.secrets.aws_secrets_manager.SecretsManagerBackend + backend_kwargs = {"connections_prefix": "airflow/connections"} + + For example, if secrets prefix is ``airflow/connections/smtp_default``, this would be accessible + if you provide ``{"connections_prefix": "airflow/connections"}`` and request conn_id ``smtp_default``. + If variables prefix is ``airflow/variables/hello``, this would be accessible + if you provide ``{"variables_prefix": "airflow/variables"}`` and request variable key ``hello``. + And if config_prefix is ``airflow/config/sql_alchemy_conn``, this would be accessible + if you provide ``{"config_prefix": "airflow/config"}`` and request config + key ``sql_alchemy_conn``. + + You can also pass additional keyword arguments like ``aws_secret_access_key``, ``aws_access_key_id`` + or ``region_name`` to this class and they would be passed on to Boto3 client. + + :param connections_prefix: Specifies the prefix of the secret to read to get Connections. + :type connections_prefix: str + :param variables_prefix: Specifies the prefix of the secret to read to get Variables. + :type variables_prefix: str + :param config_prefix: Specifies the prefix of the secret to read to get Variables. + :type config_prefix: str + :param profile_name: The name of a profile to use. If not given, then the default profile is used. + :type profile_name: str + :param sep: separator used to concatenate secret_prefix and secret_id. 
Default: "/" + :type sep: str + """ + + def __init__( + self, + connections_prefix='airflow/connections', # type: str + variables_prefix='airflow/variables', # type: str + config_prefix='airflow/config', # type: str + profile_name=None, # type: Optional[str] + sep="/", # type: str + **kwargs + ): + super(SecretsManagerBackend, self).__init__(**kwargs) + self.connections_prefix = connections_prefix.rstrip("/") + self.variables_prefix = variables_prefix.rstrip('/') + self.config_prefix = config_prefix.rstrip('/') + self.profile_name = profile_name + self.sep = sep + self.kwargs = kwargs + + @cached_property + def client(self): + """ + Create a Secrets Manager client + """ + session = boto3.session.Session( + profile_name=self.profile_name, + ) + return session.client(service_name="secretsmanager", **self.kwargs) + + def get_conn_uri(self, conn_id): + # type: (str) -> Optional[str] + """ + Get Connection Value + + :param conn_id: connection id + :type conn_id: str + """ + return self._get_secret(self.connections_prefix, conn_id) + + def get_variable(self, key): + # type: (str) -> Optional[str] + """ + Get Airflow Variable + + :param key: Variable Key + :return: Variable Value + """ + return self._get_secret(self.variables_prefix, key) + + def get_config(self, key): + # type: (str) -> Optional[str] + """ + Get Airflow Configuration + + :param key: Configuration Option Key + :return: Configuration Option Value + """ + return self._get_secret(self.config_prefix, key) + + def _get_secret(self, path_prefix, secret_id): + # type: (str, str) -> Optional[str] + """ + Get secret value from Secrets Manager + + :param path_prefix: Prefix for the Path to get Secret + :type path_prefix: str + :param secret_id: Secret Key + :type secret_id: str + """ + secrets_path = self.build_path(path_prefix, secret_id, self.sep) + try: + response = self.client.get_secret_value( + SecretId=secrets_path, + ) + return response.get('SecretString') + except self.client.exceptions.ResourceNotFoundException: + self.log.debug( + "An error occurred (ResourceNotFoundException) when calling the " + "get_secret_value operation: " + "Secret %s not found.", secrets_path + ) + return None diff --git a/airflow/contrib/secrets/aws_systems_manager.py b/airflow/contrib/secrets/aws_systems_manager.py index 971ad18acaf4e..d85c736e7a89d 100644 --- a/airflow/contrib/secrets/aws_systems_manager.py +++ b/airflow/contrib/secrets/aws_systems_manager.py @@ -43,6 +43,13 @@ class SystemsManagerParameterStoreBackend(BaseSecretsBackend, LoggingMixin): if you provide ``{"connections_prefix": "/airflow/connections"}`` and request conn_id ``smtp_default``. And if ssm path is ``/airflow/variables/hello``, this would be accessible if you provide ``{"variables_prefix": "/airflow/variables"}`` and request conn_id ``hello``. + + :param connections_prefix: Specifies the prefix of the secret to read to get Connections. + :type connections_prefix: str + :param variables_prefix: Specifies the prefix of the secret to read to get Variables. + :type variables_prefix: str + :param profile_name: The name of a profile to use. If not given, then the default profile is used. 
+ :type profile_name: str """ def __init__( @@ -52,10 +59,11 @@ def __init__( profile_name=None, # type: Optional[str] **kwargs ): + super(SystemsManagerParameterStoreBackend, self).__init__(**kwargs) self.connections_prefix = connections_prefix.rstrip("/") self.variables_prefix = variables_prefix.rstrip('/') self.profile_name = profile_name - super(SystemsManagerParameterStoreBackend, self).__init__(**kwargs) + self.kwargs = kwargs @cached_property def client(self): @@ -63,7 +71,7 @@ def client(self): Create a SSM client """ session = boto3.Session(profile_name=self.profile_name) - return session.client("ssm") + return session.client("ssm", **self.kwargs) def get_conn_uri(self, conn_id): # type: (str) -> Optional[str] @@ -74,7 +82,6 @@ def get_conn_uri(self, conn_id): :type conn_id: str :rtype: str """ - return self._get_secret(self.connections_prefix, conn_id) def get_variable(self, key): @@ -100,7 +107,7 @@ def _get_secret(self, path_prefix, secret_id): ssm_path = self.build_path(path_prefix, secret_id) try: response = self.client.get_parameter( - Name=ssm_path, WithDecryption=False + Name=ssm_path, WithDecryption=True ) value = response["Parameter"]["Value"] return value diff --git a/airflow/contrib/secrets/azure_key_vault.py b/airflow/contrib/secrets/azure_key_vault.py new file mode 100644 index 0000000000000..2e48efeab6086 --- /dev/null +++ b/airflow/contrib/secrets/azure_key_vault.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import Optional + +from azure.core.exceptions import ResourceNotFoundError +from azure.identity import DefaultAzureCredential +from azure.keyvault.secrets import SecretClient +from cached_property import cached_property + +from airflow.secrets import BaseSecretsBackend +from airflow.utils.log.logging_mixin import LoggingMixin + + +class AzureKeyVaultBackend(BaseSecretsBackend, LoggingMixin): + """ + Retrieves Airflow Connections or Variables from Azure Key Vault secrets. + + The Azure Key Vault can be configured as a secrets backend in the ``airflow.cfg``: + + .. code-block:: ini + + [secrets] + backend = airflow.providers.microsoft.azure.secrets.azure_key_vault.AzureKeyVaultBackend + backend_kwargs = {"connections_prefix": "airflow-connections", "vault_url": ""} + + For example, if the secrets prefix is ``airflow-connections-smtp-default``, this would be accessible + if you provide ``{"connections_prefix": "airflow-connections"}`` and request conn_id ``smtp-default``. + And if variables prefix is ``airflow-variables-hello``, this would be accessible + if you provide ``{"variables_prefix": "airflow-variables"}`` and request variable key ``hello``. 
+ + :param connections_prefix: Specifies the prefix of the secret to read to get Connections + :type connections_prefix: str + :param variables_prefix: Specifies the prefix of the secret to read to get Variables + :type variables_prefix: str + :param config_prefix: Specifies the prefix of the secret to read to get Variables. + :type config_prefix: str + :param vault_url: The URL of an Azure Key Vault to use + :type vault_url: str + :param sep: separator used to concatenate secret_prefix and secret_id. Default: "-" + :type sep: str + """ + + def __init__( + self, + connections_prefix='airflow-connections', # type: str + variables_prefix='airflow-variables', # type: str + config_prefix='airflow-config', # type: str + vault_url='', # type: str + sep='-', # type: str + **kwargs + ): + super(AzureKeyVaultBackend, self).__init__() + self.vault_url = vault_url + self.connections_prefix = connections_prefix.rstrip(sep) + self.variables_prefix = variables_prefix.rstrip(sep) + self.config_prefix = config_prefix.rstrip(sep) + self.sep = sep + self.kwargs = kwargs + + @cached_property + def client(self): + """ + Create a Azure Key Vault client. + """ + credential = DefaultAzureCredential() + client = SecretClient(vault_url=self.vault_url, credential=credential, **self.kwargs) + return client + + def get_conn_uri(self, conn_id): + # type: (str) -> Optional[str] + """ + Get an Airflow Connection URI from an Azure Key Vault secret + + :param conn_id: The Airflow connection id to retrieve + :type conn_id: str + """ + return self._get_secret(self.connections_prefix, conn_id) + + def get_variable(self, key): + # type: (str) -> Optional[str] + """ + Get an Airflow Variable from an Azure Key Vault secret. + + :param key: Variable Key + :type key: str + :return: Variable Value + """ + return self._get_secret(self.variables_prefix, key) + + def get_config(self, key): + # type: (str) -> Optional[str] + """ + Get Airflow Configuration + + :param key: Configuration Option Key + :return: Configuration Option Value + """ + return self._get_secret(self.config_prefix, key) + + @staticmethod + def build_path(path_prefix, secret_id, sep='-'): + # type: (str, str, str) -> str + """ + Given a path_prefix and secret_id, build a valid secret name for the Azure Key Vault Backend. + Also replaces underscore in the path with dashes to support easy switching between + environment variables, so ``connection_default`` becomes ``connection-default``. 
+ + :param path_prefix: The path prefix of the secret to retrieve + :type path_prefix: str + :param secret_id: Name of the secret + :type secret_id: str + :param sep: Separator used to concatenate path_prefix and secret_id + :type sep: str + """ + path = '{}{}{}'.format(path_prefix, sep, secret_id) + return path.replace('_', sep) + + def _get_secret(self, path_prefix, secret_id): + # type: (str, str) -> Optional[str] + """ + Get an Azure Key Vault secret value + + :param path_prefix: Prefix for the Path to get Secret + :type path_prefix: str + :param secret_id: Secret Key + :type secret_id: str + """ + name = self.build_path(path_prefix, secret_id, self.sep) + try: + secret = self.client.get_secret(name=name) + return secret.value + except ResourceNotFoundError as ex: + self.log.debug('Secret %s not found: %s', name, ex) + return None diff --git a/airflow/contrib/secrets/hashicorp_vault.py b/airflow/contrib/secrets/hashicorp_vault.py index 871373357577b..edf48c3d5a21c 100644 --- a/airflow/contrib/secrets/hashicorp_vault.py +++ b/airflow/contrib/secrets/hashicorp_vault.py @@ -55,10 +55,13 @@ class VaultBackend(BaseSecretsBackend, LoggingMixin): :param variables_path: Specifies the path of the secret to read to get Variables. (default: 'variables') :type variables_path: str + :param config_path: Specifies the path of the secret to read Airflow Configurations + (default: 'config'). + :type config_path: str :param url: Base URL for the Vault instance being addressed. :type url: str :param auth_type: Authentication Type for Vault (one of 'token', 'ldap', 'userpass', 'approle', - 'github', 'gcp). Default is ``token``. + 'github', 'gcp', 'kubernetes'). Default is ``token``. :type auth_type: str :param mount_point: The "path" the secret engine was mounted on. 
(Default: ``secret``) :type mount_point: str @@ -73,6 +76,11 @@ class VaultBackend(BaseSecretsBackend, LoggingMixin): :type password: str :param role_id: Role ID for Authentication (for ``approle`` auth_type) :type role_id: str + :param kubernetes_role: Role for Authentication (for ``kubernetes`` auth_type) + :type kubernetes_role: str + :param kubernetes_jwt_path: Path for kubernetes jwt token (for ``kubernetes`` auth_type, default: + ``/var/run/secrets/kubernetes.io/serviceaccount/token``) + :type kubernetes_jwt_path: str :param secret_id: Secret ID for Authentication (for ``approle`` auth_type) :type secret_id: str :param gcp_key_path: Path to GCP Credential JSON file (for ``gcp`` auth_type) @@ -84,6 +92,7 @@ def __init__( # pylint: disable=too-many-arguments self, connections_path='connections', # type: str variables_path='variables', # type: str + config_path='config', # type: str url=None, # type: Optional[str] auth_type='token', # type: str mount_point='secret', # type: str @@ -92,14 +101,17 @@ def __init__( # pylint: disable=too-many-arguments username=None, # type: Optional[str] password=None, # type: Optional[str] role_id=None, # type: Optional[str] + kubernetes_role=None, # type: Optional[str] + kubernetes_jwt_path='/var/run/secrets/kubernetes.io/serviceaccount/token', # type: str secret_id=None, # type: Optional[str] gcp_key_path=None, # type: Optional[str] gcp_scopes=None, # type: Optional[str] **kwargs ): - super(VaultBackend, self).__init__(**kwargs) + super(VaultBackend, self).__init__() self.connections_path = connections_path.rstrip('/') self.variables_path = variables_path.rstrip('/') + self.config_path = config_path.rstrip('/') self.url = url self.auth_type = auth_type self.kwargs = kwargs @@ -107,6 +119,8 @@ def __init__( # pylint: disable=too-many-arguments self.username = username self.password = password self.role_id = role_id + self.kubernetes_role = kubernetes_role + self.kubernetes_jwt_path = kubernetes_jwt_path self.secret_id = secret_id self.mount_point = mount_point self.kv_engine_version = kv_engine_version @@ -132,6 +146,12 @@ def client(self): _client.auth_userpass(username=self.username, password=self.password) elif self.auth_type == "approle": _client.auth_approle(role_id=self.role_id, secret_id=self.secret_id) + elif self.auth_type == "kubernetes": + if not self.kubernetes_role: + raise VaultError("kubernetes_role cannot be None for auth_type='kubernetes'") + with open(self.kubernetes_jwt_path) as f: + jwt = f.read() + _client.auth_kubernetes(role=self.kubernetes_role, jwt=jwt) elif self.auth_type == "github": _client.auth.github.login(token=self.token) elif self.auth_type == "gcp": @@ -164,7 +184,7 @@ def get_conn_uri(self, conn_id): def get_variable(self, key): # type: (str) -> Optional[str] """ - Get Airflow Variable from Environment Variable + Get Airflow Variable :param key: Variable Key :return: Variable Value @@ -198,3 +218,16 @@ def _get_secret(self, path_prefix, secret_id): return_data = response["data"] if self.kv_engine_version == 1 else response["data"]["data"] return return_data + + def get_config(self, key): + # type: (str) -> Optional[str] + """ + Get Airflow Configuration + + :param key: Configuration Option Key + :type key: str + :rtype: str + :return: Configuration Option Value retrieved from the vault + """ + response = self._get_secret(self.config_path, key) + return response.get("value") if response else None diff --git a/airflow/contrib/utils/gcp_field_validator.py b/airflow/contrib/utils/gcp_field_validator.py index
73e37f3e41be9..7780744db749f 100644 --- a/airflow/contrib/utils/gcp_field_validator.py +++ b/airflow/contrib/utils/gcp_field_validator.py @@ -92,7 +92,7 @@ ranges of values), booleans or any other types of fields. * API version: (key="api_version") if API version is specified, then the field will only be validated when api_version used at field validator initialization matches exactly the - the version specified. If you want to declare fields that are available in several + version specified. If you want to declare fields that are available in several versions of the APIs, you should specify the field as many times as many API versions should be supported (each time with different API version). * if none of the keys ("type", "regexp", "custom_validation" - the field is not validated @@ -251,7 +251,7 @@ def _validate_is_empty(full_field_path, value): if not value: raise GcpFieldValidationException( "The body field '{}' can't be empty. Please provide a value." - .format(full_field_path, value)) + .format(full_field_path)) def _validate_dict(self, children_validation_specs, full_field_path, value): # type: (dict, str, dict) -> None diff --git a/airflow/contrib/utils/sendgrid.py b/airflow/contrib/utils/sendgrid.py index a1854679f50ae..be0e2493b466c 100644 --- a/airflow/contrib/utils/sendgrid.py +++ b/airflow/contrib/utils/sendgrid.py @@ -23,6 +23,7 @@ from __future__ import unicode_literals import base64 +import logging import mimetypes import os @@ -31,7 +32,8 @@ Personalization, CustomArg, Category from airflow.utils.email import get_email_address_list -from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) def send_email(to, subject, html_content, files=None, @@ -104,9 +106,8 @@ def send_email(to, subject, html_content, files=None, def _post_sendgrid_mail(mail_data): - log = LoggingMixin().log - sg = sendgrid.SendGridAPIClient(apikey=os.environ.get('SENDGRID_API_KEY')) - response = sg.client.mail.send.post(request_body=mail_data) + sendgrid_client = sendgrid.SendGridAPIClient(api_key=os.environ.get('SENDGRID_API_KEY')) + response = sendgrid_client.client.mail.send.post(request_body=mail_data) # 2xx status code. 
if response.status_code >= 200 and response.status_code < 300: log.info('Email with subject %s is successfully sent to recipients: %s' % diff --git a/airflow/dag/base_dag.py b/airflow/dag/base_dag.py index 0e65775d41424..6e556a3a4cd10 100644 --- a/airflow/dag/base_dag.py +++ b/airflow/dag/base_dag.py @@ -63,14 +63,6 @@ def concurrency(self): """ raise NotImplementedError() - @abstractmethod - def is_paused(self): - """ - :return: whether this DAG is paused or not - :rtype: bool - """ - raise NotImplementedError() - @abstractmethod def pickle_id(self): """ diff --git a/airflow/example_dags/example_complex.py b/airflow/example_dags/example_complex.py index ec39eec55e5d0..a2f71f2cad9c9 100644 --- a/airflow/example_dags/example_complex.py +++ b/airflow/example_dags/example_complex.py @@ -78,7 +78,7 @@ ) create_tag_template_field_result2 = BashOperator( - task_id="create_tag_template_field_result", bash_command="echo create_tag_template_field_result" + task_id="create_tag_template_field_result2", bash_command="echo create_tag_template_field_result" ) # Delete diff --git a/airflow/contrib/example_dags/example_kubernetes_executor_config.py b/airflow/example_dags/example_kubernetes_executor_config.py similarity index 96% rename from airflow/contrib/example_dags/example_kubernetes_executor_config.py rename to airflow/example_dags/example_kubernetes_executor_config.py index d7409569e388a..2e4ba00962421 100644 --- a/airflow/contrib/example_dags/example_kubernetes_executor_config.py +++ b/airflow/example_dags/example_kubernetes_executor_config.py @@ -83,14 +83,14 @@ def test_volume_mount(): } ) - # Test that we can run tasks as a normal user + # Test that we can add labels to pods third_task = PythonOperator( task_id="non_root_task", python_callable=print_stuff, executor_config={ "KubernetesExecutor": { - "securityContext": { - "runAsUser": 1000 + "labels": { + "release": "stable" } } } diff --git a/airflow/example_dags/example_latest_only_with_trigger.py b/airflow/example_dags/example_latest_only_with_trigger.py index 59b33d611c505..77fe89ebbc1e2 100644 --- a/airflow/example_dags/example_latest_only_with_trigger.py +++ b/airflow/example_dags/example_latest_only_with_trigger.py @@ -19,6 +19,8 @@ """ Example LatestOnlyOperator and TriggerRule interactions """ + +# [START example] import datetime as dt from airflow.models import DAG @@ -42,3 +44,4 @@ latest_only >> task1 >> [task3, task4] task2 >> [task3, task4] +# [END example] diff --git a/airflow/example_dags/example_subdag_operator.py b/airflow/example_dags/example_subdag_operator.py index c3a2b19f66ec4..d68e60cbcf2e2 100644 --- a/airflow/example_dags/example_subdag_operator.py +++ b/airflow/example_dags/example_subdag_operator.py @@ -19,6 +19,7 @@ """Example DAG demonstrating the usage of the SubDagOperator.""" +# [START example_subdag_operator] from airflow.example_dags.subdags.subdag import subdag from airflow.models import DAG from airflow.operators.dummy_operator import DummyOperator @@ -67,3 +68,4 @@ ) start >> section_1 >> some_other_task >> section_2 >> end +# [END example_subdag_operator] diff --git a/airflow/example_dags/example_trigger_target_dag.py b/airflow/example_dags/example_trigger_target_dag.py index c583439efa7c1..2129ea5032257 100644 --- a/airflow/example_dags/example_trigger_target_dag.py +++ b/airflow/example_dags/example_trigger_target_dag.py @@ -66,7 +66,7 @@ def run_this_func(ds, **kwargs): # You can also access the DagRun object in templates bash_task = BashOperator( task_id="bash_task", - bash_command='echo "Here is the 
message: ' - '{{ dag_run.conf["message"] if dag_run else "" }}" ', + bash_command='echo "Here is the message: $message"', + env={'message': '{{ dag_run.conf["message"] if dag_run else "" }}'}, dag=dag, ) diff --git a/airflow/example_dags/subdags/subdag.py b/airflow/example_dags/subdags/subdag.py index 6a67c7d222865..ff7fee5045a89 100644 --- a/airflow/example_dags/subdags/subdag.py +++ b/airflow/example_dags/subdags/subdag.py @@ -17,6 +17,7 @@ # specific language governing permissions and limitations # under the License. +# [START subdag] from airflow.models import DAG from airflow.operators.dummy_operator import DummyOperator @@ -36,3 +37,4 @@ def subdag(parent_dag_name, child_dag_name, args): ) return dag_subdag +# [END subdag] diff --git a/airflow/example_dags/tutorial.py b/airflow/example_dags/tutorial.py index 18685d11e1ce1..8858452e1ab9f 100644 --- a/airflow/example_dags/tutorial.py +++ b/airflow/example_dags/tutorial.py @@ -23,9 +23,9 @@ [here](https://airflow.apache.org/tutorial.html) """ # [START tutorial] +# [START import_module] from datetime import timedelta -# [START import_module] # The DAG object; we'll need this to instantiate a DAG from airflow import DAG # Operators; we need this to operate! diff --git a/airflow/exceptions.py b/airflow/exceptions.py index 5709e25f30ef7..51e3cb4f70939 100644 --- a/airflow/exceptions.py +++ b/airflow/exceptions.py @@ -20,6 +20,10 @@ # Note: Any AirflowException raised is expected to cause the TaskInstance # to be marked in an ERROR state """Exceptions used by Airflow""" +from collections import namedtuple + +from airflow.utils.code_utils import prepare_code_snippet +from airflow.utils.platform import is_tty class AirflowException(Exception): @@ -71,10 +75,18 @@ class AirflowSkipException(AirflowException): """Raise when the task should be skipped""" +class AirflowFailException(AirflowException): + """Raise when the task should be failed without retrying""" + + class AirflowDagCycleException(AirflowException): """Raise when there is a cycle in Dag definition""" +class AirflowClusterPolicyViolation(AirflowException): + """Raise when there is a violation of a Cluster Policy in Dag definition""" + + class DagNotFound(AirflowNotFoundException): """Raise when a DAG is not available in the system""" @@ -117,3 +129,35 @@ class DagConcurrencyLimitReached(AirflowException): class TaskConcurrencyLimitReached(AirflowException): """Raise when task concurrency limit is reached""" + + +file_syntax_error = namedtuple('FileSyntaxError', 'line_no message') +"""Information about a single error in a file.""" + + +class AirflowFileParseException(AirflowException): + """ + Raises when connection or variable file can not be parsed + + :param msg: The human-readable description of the exception + :param file_path: A processed file that contains errors + :param parse_errors: File syntax errors + """ + def __init__(self, msg, file_path, parse_errors): + super(AirflowException, self).__init__(msg) + self.msg = msg + self.file_path = file_path + self.parse_errors = parse_errors + + def __str__(self): + result = self.msg + "\nFilename: " + self.file_path + "\n\n" + + for error_no, parse_error in enumerate(self.parse_errors, 1): + result += "=" * 20 + " Parse error {error_no:3} ".format(error_no=error_no) + "=" * 20 + "\n" + result += parse_error.message + "\n" + if parse_error.line_no: + result += "Line number: {}\n".format(parse_error.line_no) + if parse_error.line_no and is_tty(): + result += "\n" + prepare_code_snippet(self.file_path, parse_error.line_no) + "\n" + + 
return result diff --git a/airflow/executors/__init__.py b/airflow/executors/__init__.py index e324d4ace248f..948bf338df57b 100644 --- a/airflow/executors/__init__.py +++ b/airflow/executors/__init__.py @@ -16,9 +16,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +import logging import sys -from airflow.utils.log.logging_mixin import LoggingMixin from airflow.configuration import conf from airflow.exceptions import AirflowException from airflow.executors.base_executor import BaseExecutor # noqa @@ -26,6 +25,7 @@ from airflow.executors.sequential_executor import SequentialExecutor DEFAULT_EXECUTOR = None +log = logging.getLogger(__name__) def _integrate_plugins(): @@ -47,7 +47,6 @@ def get_default_executor(): DEFAULT_EXECUTOR = _get_executor(executor_name) - log = LoggingMixin().log log.info("Using executor %s", executor_name) return DEFAULT_EXECUTOR @@ -83,7 +82,7 @@ def _get_executor(executor_name): from airflow.contrib.executors.mesos_executor import MesosExecutor return MesosExecutor() elif executor_name == Executors.KubernetesExecutor: - from airflow.contrib.executors.kubernetes_executor import KubernetesExecutor + from airflow.executors.kubernetes_executor import KubernetesExecutor return KubernetesExecutor() elif executor_name == Executors.DebugExecutor: from airflow.executors.debug_executor import DebugExecutor diff --git a/airflow/executors/celery_executor.py b/airflow/executors/celery_executor.py index 42bf6116a1082..35b4e84d91cce 100644 --- a/airflow/executors/celery_executor.py +++ b/airflow/executors/celery_executor.py @@ -16,7 +16,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""CeleryExecutor +.. seealso:: + For more information on how the CeleryExecutor works, take a look at the guide: + :ref:`executor:CeleryExecutor` +""" +import logging import math import os import subprocess @@ -31,10 +37,11 @@ from airflow.config_templates.default_celery import DEFAULT_CELERY_CONFIG from airflow.exceptions import AirflowException from airflow.executors.base_executor import BaseExecutor -from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.module_loading import import_string from airflow.utils.timeout import timeout +log = logging.getLogger(__name__) + # Make it constant for unit test. CELERY_FETCH_ERR_MSG_HEADER = 'Error fetching Celery task state' @@ -61,7 +68,9 @@ @app.task def execute_command(command_to_exec): - log = LoggingMixin().log + """Executes command.""" + if command_to_exec[0:2] != ["airflow", "run"]: + raise ValueError('The command must start with ["airflow", "run"].') log.info("Executing command in Celery: %s", command_to_exec) env = os.environ.copy() try: @@ -210,7 +219,14 @@ def trigger_tasks(self, open_slots): chunksize = self._num_tasks_per_send_process(len(task_tuples_to_send)) num_processes = min(len(task_tuples_to_send), self._sync_parallelism) - send_pool = Pool(processes=num_processes) + def reset_signals(): + # Since we are run from inside the SchedulerJob, we don't to + # inherit the signal handlers that we registered there. 
+ import signal + signal.signal(signal.SIGINT, signal.SIG_DFL) + signal.signal(signal.SIGTERM, signal.SIG_DFL) + + send_pool = Pool(processes=num_processes, initializer=reset_signals) key_and_async_results = send_pool.map( send_task_to_executor, task_tuples_to_send, @@ -222,7 +238,7 @@ def trigger_tasks(self, open_slots): for key, command, result in key_and_async_results: if isinstance(result, ExceptionWithTraceback): - self.log.error( + self.log.error( # pylint: disable=logging-not-lazy CELERY_SEND_ERR_MSG_HEADER + ":%s\n%s\n", result.exception, result.traceback ) elif result is not None: @@ -261,7 +277,7 @@ def sync(self): for key_and_state in task_keys_to_states: if isinstance(key_and_state, ExceptionWithTraceback): - self.log.error( + self.log.error( # pylint: disable=logging-not-lazy CELERY_FETCH_ERR_MSG_HEADER + ", ignoring it:%s\n%s\n", repr(key_and_state.exception), key_and_state.traceback ) diff --git a/airflow/executors/dask_executor.py b/airflow/executors/dask_executor.py index d322f34bdc829..ea0f397733994 100644 --- a/airflow/executors/dask_executor.py +++ b/airflow/executors/dask_executor.py @@ -16,6 +16,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +DaskExecutor + +.. seealso:: + For more information on how the DaskExecutor works, take a look at the guide: + :ref:`executor:DaskExecutor` +""" import distributed import subprocess @@ -64,6 +71,9 @@ def execute_async(self, key, command, queue=None, executor_config=None): 'All tasks will be run in the same cluster' ) + if command[0:2] != ["airflow", "run"]: + raise ValueError('The command must start with ["airflow", "run"].') + def airflow_run(): return subprocess.check_call(command, close_fds=True) diff --git a/airflow/executors/debug_executor.py b/airflow/executors/debug_executor.py index 37bed09487cf2..7f614378ebc30 100644 --- a/airflow/executors/debug_executor.py +++ b/airflow/executors/debug_executor.py @@ -17,8 +17,11 @@ # specific language governing permissions and limitations # under the License. """ -This module contains DebugExecutor that is a single -process executor meaning it does not use multiprocessing. +DebugExecutor + +.. seealso:: + For more information on how the DebugExecutor works, take a look at the guide: + :ref:`executor:DebugExecutor` """ import threading diff --git a/airflow/executors/kubernetes_executor.py b/airflow/executors/kubernetes_executor.py new file mode 100644 index 0000000000000..73dd91e914456 --- /dev/null +++ b/airflow/executors/kubernetes_executor.py @@ -0,0 +1,903 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +KubernetesExecutor + +.. 
seealso:: + For more information on how the KubernetesExecutor works, take a look at the guide: + :ref:`executor:KubernetesExecutor` +""" +import base64 +import functools +import json +import multiprocessing +import time +from queue import Empty + +import kubernetes +from dateutil import parser +from kubernetes import watch, client +from kubernetes.client.rest import ApiException +from urllib3.exceptions import HTTPError, ReadTimeoutError + +from airflow import settings +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException, AirflowException +from airflow.executors.base_executor import BaseExecutor +from airflow.kubernetes import pod_generator +from airflow.kubernetes.kube_client import get_kube_client +from airflow.kubernetes.pod_generator import MAX_POD_ID_LEN +from airflow.kubernetes.pod_generator import PodGenerator +from airflow.kubernetes.pod_launcher import PodLauncher +from airflow.kubernetes.worker_configuration import WorkerConfiguration +from airflow.models import KubeResourceVersion, KubeWorkerIdentifier, TaskInstance +from airflow.utils.db import provide_session, create_session +from airflow.utils.log.logging_mixin import LoggingMixin +from airflow.utils.state import State + + +class KubeConfig: + """Configuration for Kubernetes""" + core_section = 'core' + kubernetes_section = 'kubernetes' + + def __init__(self): + configuration_dict = conf.as_dict(display_sensitive=True) + self.core_configuration = configuration_dict['core'] + self.kube_secrets = configuration_dict.get('kubernetes_secrets', {}) + self.kube_env_vars = configuration_dict.get('kubernetes_environment_variables', {}) + self.env_from_configmap_ref = conf.get(self.kubernetes_section, + 'env_from_configmap_ref') + self.env_from_secret_ref = conf.get(self.kubernetes_section, + 'env_from_secret_ref') + self.airflow_home = settings.AIRFLOW_HOME + self.dags_folder = conf.get(self.core_section, 'dags_folder') + self.parallelism = conf.getint(self.core_section, 'parallelism') + self.worker_container_repository = conf.get( + self.kubernetes_section, 'worker_container_repository') + self.worker_container_tag = conf.get( + self.kubernetes_section, 'worker_container_tag') + self.kube_image = '{}:{}'.format( + self.worker_container_repository, self.worker_container_tag) + self.kube_image_pull_policy = conf.get( + self.kubernetes_section, "worker_container_image_pull_policy" + ) + self.kube_node_selectors = configuration_dict.get('kubernetes_node_selectors', {}) + self.kube_annotations = configuration_dict.get('kubernetes_annotations', {}) or None + self.pod_template_file = conf.get(self.kubernetes_section, 'pod_template_file', + fallback=None) + + self.kube_labels = configuration_dict.get('kubernetes_labels', {}) + self.delete_worker_pods = conf.getboolean( + self.kubernetes_section, 'delete_worker_pods') + self.delete_worker_pods_on_failure = conf.getboolean( + self.kubernetes_section, 'delete_worker_pods_on_failure') + self.worker_pods_creation_batch_size = conf.getint( + self.kubernetes_section, 'worker_pods_creation_batch_size') + self.worker_service_account_name = conf.get( + self.kubernetes_section, 'worker_service_account_name') + self.image_pull_secrets = conf.get(self.kubernetes_section, 'image_pull_secrets') + + # NOTE: user can build the dags into the docker image directly, + # this will set to True if so + self.dags_in_image = conf.getboolean(self.kubernetes_section, 'dags_in_image') + + # Run as user for pod security context + self.worker_run_as_user = 
self._get_security_context_val('run_as_user') + self.worker_fs_group = self._get_security_context_val('fs_group') + + # NOTE: `git_repo` and `git_branch` must be specified together as a pair + # The http URL of the git repository to clone from + self.git_repo = conf.get(self.kubernetes_section, 'git_repo') + # The branch of the repository to be checked out + self.git_branch = conf.get(self.kubernetes_section, 'git_branch') + # Clone depth for git sync + self.git_sync_depth = conf.get(self.kubernetes_section, 'git_sync_depth') + # Optionally, the directory in the git repository containing the dags + self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath') + # Optionally, the root directory for git operations + self.git_sync_root = conf.get(self.kubernetes_section, 'git_sync_root') + # Optionally, the name at which to publish the checked-out files under --root + self.git_sync_dest = conf.get(self.kubernetes_section, 'git_sync_dest') + # Optionally, the tag or hash to checkout + self.git_sync_rev = conf.get(self.kubernetes_section, 'git_sync_rev') + # Optionally, if git_dags_folder_mount_point is set the worker will use + # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath} as dags_folder + self.git_dags_folder_mount_point = conf.get(self.kubernetes_section, + 'git_dags_folder_mount_point') + + # Optionally a user may supply a (`git_user` AND `git_password`) OR + # (`git_ssh_key_secret_name` AND `git_ssh_key_secret_key`) for private repositories + self.git_user = conf.get(self.kubernetes_section, 'git_user') + self.git_password = conf.get(self.kubernetes_section, 'git_password') + self.git_ssh_key_secret_name = conf.get(self.kubernetes_section, 'git_ssh_key_secret_name') + self.git_ssh_known_hosts_configmap_name = conf.get(self.kubernetes_section, + 'git_ssh_known_hosts_configmap_name') + self.git_sync_credentials_secret = conf.get(self.kubernetes_section, + 'git_sync_credentials_secret') + + # NOTE: The user may optionally use a volume claim to mount a PV containing + # DAGs directly + self.dags_volume_claim = conf.get(self.kubernetes_section, 'dags_volume_claim') + + self.dags_volume_mount_point = conf.get(self.kubernetes_section, 'dags_volume_mount_point') + + # This prop may optionally be set for PV Claims and is used to write logs + self.logs_volume_claim = conf.get(self.kubernetes_section, 'logs_volume_claim') + + # This prop may optionally be set for PV Claims and is used to locate DAGs + # on a SubPath + self.dags_volume_subpath = conf.get( + self.kubernetes_section, 'dags_volume_subpath') + + # This prop may optionally be set for PV Claims and is used to locate logs + # on a SubPath + self.logs_volume_subpath = conf.get( + self.kubernetes_section, 'logs_volume_subpath') + + # Optionally, hostPath volume containing DAGs + self.dags_volume_host = conf.get(self.kubernetes_section, 'dags_volume_host') + + # Optionally, write logs to a hostPath Volume + self.logs_volume_host = conf.get(self.kubernetes_section, 'logs_volume_host') + + # This prop may optionally be set for PV Claims and is used to write logs + self.base_log_folder = conf.get(self.core_section, 'base_log_folder') + + # The Kubernetes Namespace in which the Scheduler and Webserver reside. Note + # that if your + # cluster has RBAC enabled, your scheduler may need service account permissions to + # create, watch, get, and delete pods in this namespace. 
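As the comments above note, when git_dags_folder_mount_point is set the worker derives its effective dags_folder from three git-sync settings; a small sketch with assumed values:

    # Hypothetical values; the worker uses {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath}
    git_dags_folder_mount_point = '/opt/airflow/dags'
    git_sync_dest = 'repo'
    git_subpath = 'dags'
    dags_folder = '/'.join([git_dags_folder_mount_point, git_sync_dest, git_subpath])
    # -> '/opt/airflow/dags/repo/dags'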
+ self.kube_namespace = conf.get(self.kubernetes_section, 'namespace') + self.multi_namespace_mode = conf.getboolean(self.kubernetes_section, 'multi_namespace_mode') + # The Kubernetes Namespace in which pods will be created by the executor. Note + # that if your + # cluster has RBAC enabled, your workers may need service account permissions to + # interact with cluster components. + self.executor_namespace = conf.get(self.kubernetes_section, 'namespace') + # Task secrets managed by KubernetesExecutor. + self.gcp_service_account_keys = conf.get(self.kubernetes_section, + 'gcp_service_account_keys') + + # If the user is using the git-sync container to clone their repository via git, + # allow them to specify repository, tag, and pod name for the init container. + self.git_sync_container_repository = conf.get( + self.kubernetes_section, 'git_sync_container_repository') + + self.git_sync_container_tag = conf.get( + self.kubernetes_section, 'git_sync_container_tag') + self.git_sync_container = '{}:{}'.format( + self.git_sync_container_repository, self.git_sync_container_tag) + + self.git_sync_init_container_name = conf.get( + self.kubernetes_section, 'git_sync_init_container_name') + + self.git_sync_run_as_user = self._get_security_context_val('git_sync_run_as_user') + + # The worker pod may optionally have a valid Airflow config loaded via a + # configmap + self.airflow_configmap = conf.get(self.kubernetes_section, 'airflow_configmap') + + # The worker pod may optionally have a valid Airflow local settings loaded via a + # configmap + self.airflow_local_settings_configmap = conf.get( + self.kubernetes_section, 'airflow_local_settings_configmap') + + affinity_json = conf.get(self.kubernetes_section, 'affinity') + if affinity_json: + self.kube_affinity = json.loads(affinity_json) + else: + self.kube_affinity = None + + tolerations_json = conf.get(self.kubernetes_section, 'tolerations') + if tolerations_json: + self.kube_tolerations = json.loads(tolerations_json) + else: + self.kube_tolerations = None + + kube_client_request_args = conf.get(self.kubernetes_section, 'kube_client_request_args') + if kube_client_request_args: + self.kube_client_request_args = json.loads(kube_client_request_args) + if self.kube_client_request_args['_request_timeout'] and \ + isinstance(self.kube_client_request_args['_request_timeout'], list): + self.kube_client_request_args['_request_timeout'] = \ + tuple(self.kube_client_request_args['_request_timeout']) + else: + self.kube_client_request_args = {} + self._validate() + + delete_option_kwargs = conf.get(self.kubernetes_section, 'delete_option_kwargs') + if delete_option_kwargs: + self.delete_option_kwargs = json.loads(delete_option_kwargs) + else: + self.delete_option_kwargs = {} + + # pod security context items should return integers + # and only return a blank string if contexts are not set. 
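For context, kube_client_request_args above is read as a JSON string, and a list-valued _request_timeout is converted to the tuple form the Kubernetes client expects; an illustrative value (not taken from the diff):

    import json

    raw = '{"_request_timeout": [60, 60]}'                       # hypothetical airflow.cfg setting
    args = json.loads(raw)                                        # {'_request_timeout': [60, 60]}
    args['_request_timeout'] = tuple(args['_request_timeout'])   # (60, 60)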
+ def _get_security_context_val(self, scontext): + val = conf.get(self.kubernetes_section, scontext) + if not val: + return "" + else: + return int(val) + + def _validate(self): + if self.pod_template_file: + return + # TODO: use XOR for dags_volume_claim and git_dags_folder_mount_point + if not self.dags_volume_claim \ + and not self.dags_volume_host \ + and not self.dags_in_image \ + and (not self.git_repo or not self.git_branch or not self.git_dags_folder_mount_point): + raise AirflowConfigException( + 'In kubernetes mode the following must be set in the `kubernetes` ' + 'config section: `dags_volume_claim` ' + 'or `dags_volume_host` ' + 'or `dags_in_image` ' + 'or `git_repo and git_branch and git_dags_folder_mount_point`') + if self.git_repo \ + and (self.git_user or self.git_password) \ + and self.git_ssh_key_secret_name: + raise AirflowConfigException( + 'In kubernetes mode, using `git_repo` to pull the DAGs: ' + 'for private repositories, either `git_user` and `git_password` ' + 'must be set for authentication through user credentials; ' + 'or `git_ssh_key_secret_name` must be set for authentication ' + 'through ssh key, but not both') + + +class KubernetesJobWatcher(multiprocessing.Process, LoggingMixin): + """Watches for Kubernetes jobs""" + + def __init__(self, + namespace, + multi_namespace_mode, + watcher_queue, + resource_version, + worker_uuid, + kube_config): + multiprocessing.Process.__init__(self) + self.namespace = namespace + self.multi_namespace_mode = multi_namespace_mode + self.worker_uuid = worker_uuid + self.watcher_queue = watcher_queue + self.resource_version = resource_version + self.kube_config = kube_config + + def run(self): + """Performs watching""" + kube_client = get_kube_client() + while True: + try: + self.resource_version = self._run(kube_client, self.resource_version, + self.worker_uuid, self.kube_config) + except ReadTimeoutError: + self.log.warning("There was a timeout error accessing the Kube API. " + "Retrying request.", exc_info=True) + time.sleep(1) + except Exception: + self.log.exception('Unknown error in KubernetesJobWatcher. 
Failing') + raise + else: + self.log.warning('Watch died gracefully, starting back up with: ' + 'last resource_version: %s', self.resource_version) + + def _run(self, kube_client, resource_version, worker_uuid, kube_config): + self.log.info( + 'Event: and now my watch begins starting at resource_version: %s', + resource_version + ) + watcher = watch.Watch() + + kwargs = {'label_selector': 'airflow-worker={}'.format(worker_uuid)} + if resource_version: + kwargs['resource_version'] = resource_version + if kube_config.kube_client_request_args: + for key, value in kube_config.kube_client_request_args.items(): + kwargs[key] = value + + last_resource_version = None + if self.multi_namespace_mode: + list_worker_pods = functools.partial(watcher.stream, + kube_client.list_pod_for_all_namespaces, + **kwargs) + else: + list_worker_pods = functools.partial(watcher.stream, + kube_client.list_namespaced_pod, + self.namespace, + **kwargs) + for event in list_worker_pods(): + task = event['object'] + self.log.info( + 'Event: %s had an event of type %s', + task.metadata.name, event['type'] + ) + if event['type'] == 'ERROR': + return self.process_error(event) + self.process_status( + pod_id=task.metadata.name, + namespace=task.metadata.namespace, + status=task.status.phase, + labels=task.metadata.labels, + resource_version=task.metadata.resource_version, + event=event, + ) + last_resource_version = task.metadata.resource_version + + return last_resource_version + + def process_error(self, event): + """Process error response""" + self.log.error( + 'Encountered Error response from k8s list namespaced pod stream => %s', + event + ) + raw_object = event['raw_object'] + if raw_object['code'] == 410: + self.log.info( + 'Kubernetes resource version is too old, must reset to 0 => %s', + (raw_object['message'],) + ) + # Return resource version 0 + return '0' + raise AirflowException( + 'Kubernetes failure for %s with code %s and message: %s' % + (raw_object['reason'], raw_object['code'], raw_object['message']) + ) + + def process_status(self, pod_id, namespace, status, labels, resource_version, event): + """Process status response""" + if status == 'Pending': + if event['type'] == 'DELETED': + self.log.info('Event: Failed to start pod %s, will reschedule', pod_id) + self.watcher_queue.put((pod_id, namespace, State.UP_FOR_RESCHEDULE, labels, resource_version)) + else: + self.log.info('Event: %s Pending', pod_id) + elif status == 'Failed': + self.log.info('Event: %s Failed', pod_id) + self.watcher_queue.put((pod_id, namespace, State.FAILED, labels, resource_version)) + elif status == 'Succeeded': + self.log.info('Event: %s Succeeded', pod_id) + self.watcher_queue.put((pod_id, namespace, None, labels, resource_version)) + elif status == 'Running': + self.log.info('Event: %s is Running', pod_id) + else: + self.log.warning( + 'Event: Invalid state: %s on pod: %s in namespace %s with labels: %s with ' + 'resource_version: %s', status, pod_id, namespace, labels, resource_version + ) + + +class AirflowKubernetesScheduler(LoggingMixin): + """Airflow Scheduler for Kubernetes""" + def __init__(self, kube_config, task_queue, result_queue, kube_client, worker_uuid): + self.log.debug("Creating Kubernetes executor") + self.kube_config = kube_config + self.task_queue = task_queue + self.result_queue = result_queue + self.namespace = self.kube_config.kube_namespace + self.log.debug("Kubernetes using namespace %s", self.namespace) + self.kube_client = kube_client + self.launcher = PodLauncher(kube_client=self.kube_client) + 
self.worker_configuration_pod = WorkerConfiguration(kube_config=self.kube_config).as_pod() + self._manager = multiprocessing.Manager() + self.watcher_queue = self._manager.Queue() + self.worker_uuid = worker_uuid + self.kube_watcher = self._make_kube_watcher() + + def _make_kube_watcher(self): + resource_version = KubeResourceVersion.get_current_resource_version() + watcher = KubernetesJobWatcher(watcher_queue=self.watcher_queue, + namespace=self.kube_config.kube_namespace, + multi_namespace_mode=self.kube_config.multi_namespace_mode, + resource_version=resource_version, + worker_uuid=self.worker_uuid, + kube_config=self.kube_config) + watcher.start() + return watcher + + def _health_check_kube_watcher(self): + if self.kube_watcher.is_alive(): + pass + else: + self.log.error( + 'Error while health checking kube watcher process. ' + 'Process died for unknown reasons') + self.kube_watcher = self._make_kube_watcher() + + def run_next(self, next_job): + """ + The run_next command will check the task_queue for any un-run jobs. + It will then create a unique job-id, launch that job in the cluster, + and store relevant info in the current_jobs map so we can track the job's + status + """ + self.log.info('Kubernetes job is %s', str(next_job)) + key, command, kube_executor_config = next_job + dag_id, task_id, execution_date, try_number = key + + if command[0:2] != ["airflow", "run"]: + raise ValueError('The command must start with ["airflow", "run"].') + + pod = PodGenerator.construct_pod( + namespace=self.namespace, + worker_uuid=self.worker_uuid, + pod_id=self._create_pod_id(dag_id, task_id), + dag_id=pod_generator.make_safe_label_value(dag_id), + task_id=pod_generator.make_safe_label_value(task_id), + try_number=try_number, + kube_image=self.kube_config.kube_image, + date=execution_date, + command=command, + pod_override_object=kube_executor_config, + base_worker_pod=self.worker_configuration_pod + ) + + sanitized_pod = self.launcher._client.api_client.sanitize_for_serialization(pod) + json_pod = json.dumps(sanitized_pod, indent=2) + + self.log.debug('Pod Creation Request before mutation: \n%s', json_pod) + + # Reconcile the pod generated by the Operator and the Pod + # generated by the .cfg file + self.log.debug("Kubernetes running for command %s", command) + self.log.debug("Kubernetes launching image %s", pod.spec.containers[0].image) + + # the watcher will monitor pods, so we do not block. + self.launcher.run_pod_async(pod, **self.kube_config.kube_client_request_args) + self.log.debug("Kubernetes Job created!") + + def delete_pod(self, pod_id, namespace): + """Deletes POD""" + try: + self.kube_client.delete_namespaced_pod( + pod_id, namespace, body=client.V1DeleteOptions(**self.kube_config.delete_option_kwargs), + **self.kube_config.kube_client_request_args) + except ApiException as e: + # If the pod is already deleted + if e.status != 404: + raise + + def sync(self): + """ + The sync function checks the status of all currently running kubernetes jobs. + If a job is completed, it's status is placed in the result queue to + be sent back to the scheduler. 
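Tying this back to the example DAG earlier in the diff: a task-level executor_config like the sketch below (values are illustrative) is what execute_async() later converts with PodGenerator.from_obj() and what run_next() above receives as the pod_override_object for construct_pod().

    # Hypothetical task-level override consumed by the KubernetesExecutor
    executor_config = {
        "KubernetesExecutor": {
            "labels": {"release": "stable"},
        }
    }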
+ + :return: + + """ + self._health_check_kube_watcher() + while True: + try: + task = self.watcher_queue.get_nowait() + try: + self.process_watcher_task(task) + finally: + self.watcher_queue.task_done() + except Empty: + break + + def process_watcher_task(self, task): + """Process the task by watcher.""" + pod_id, namespace, state, labels, resource_version = task + self.log.info( + 'Attempting to finish pod; pod_id: %s; state: %s; labels: %s', + pod_id, state, labels + ) + key = self._labels_to_key(labels=labels) + if key: + self.log.debug('finishing job %s - %s (%s)', key, state, pod_id) + self.result_queue.put((key, state, pod_id, namespace, resource_version)) + + @staticmethod + def _strip_unsafe_kubernetes_special_chars(string): + """ + Kubernetes only supports lowercase alphanumeric characters and "-" and "." in + the pod name + However, there are special rules about how "-" and "." can be used so let's + only keep + alphanumeric chars see here for detail: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/ + + :param string: The requested Pod name + :return: ``str`` Pod name stripped of any unsafe characters + """ + return ''.join(ch.lower() for ind, ch in enumerate(string) if ch.isalnum()) + + @staticmethod + def _make_safe_pod_id(safe_dag_id, safe_task_id, safe_uuid): + """ + Kubernetes pod names must be <= 253 chars and must pass the following regex for + validation + ``^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`` + + :param safe_dag_id: a dag_id with only alphanumeric characters + :param safe_task_id: a task_id with only alphanumeric characters + :param safe_uuid: a uuid + :return: ``str`` valid Pod name of appropriate length + """ + safe_key = safe_dag_id + safe_task_id + + safe_pod_id = safe_key[:MAX_POD_ID_LEN - len(safe_uuid) - 1] + "-" + safe_uuid + + return safe_pod_id + + @staticmethod + def _create_pod_id(dag_id, task_id): + safe_dag_id = AirflowKubernetesScheduler._strip_unsafe_kubernetes_special_chars( + dag_id) + safe_task_id = AirflowKubernetesScheduler._strip_unsafe_kubernetes_special_chars( + task_id) + return safe_dag_id + safe_task_id + + @staticmethod + def _label_safe_datestring_to_datetime(string): + """ + Kubernetes doesn't permit ":" in labels. 
ISO datetime format uses ":" but not + "_", let's + replace ":" with "_" + + :param string: str + :return: datetime.datetime object + """ + return parser.parse(string.replace('_plus_', '+').replace("_", ":")) + + @staticmethod + def _datetime_to_label_safe_datestring(datetime_obj): + """ + Kubernetes doesn't like ":" in labels, since ISO datetime format uses ":" but + not "_" let's + replace ":" with "_" + + :param datetime_obj: datetime.datetime object + :return: ISO-like string representing the datetime + """ + return datetime_obj.isoformat().replace(":", "_").replace('+', '_plus_') + + def _labels_to_key(self, labels): + try_num = 1 + try: + try_num = int(labels.get('try_number', '1')) + except ValueError: + self.log.warning("could not get try_number as an int: %s", labels.get('try_number', '1')) + + try: + dag_id = labels['dag_id'] + task_id = labels['task_id'] + ex_time = self._label_safe_datestring_to_datetime(labels['execution_date']) + except Exception as e: + self.log.warning( + 'Error while retrieving labels; labels: %s; exception: %s', + labels, e + ) + return None + + with create_session() as session: + task = ( + session + .query(TaskInstance) + .filter_by(task_id=task_id, dag_id=dag_id, execution_date=ex_time) + .one_or_none() + ) + if task: + self.log.info( + 'Found matching task %s-%s (%s) with current state of %s', + task.dag_id, task.task_id, task.execution_date, task.state + ) + return (dag_id, task_id, ex_time, try_num) + else: + self.log.warning( + 'task_id/dag_id are not safe to use as Kubernetes labels. This can cause ' + 'severe performance regressions. Please see ' + '. ' + 'Given dag_id: %s, task_id: %s', task_id, dag_id + ) + + tasks = ( + session + .query(TaskInstance) + .filter_by(execution_date=ex_time).all() + ) + self.log.info( + 'Checking %s task instances.', + len(tasks) + ) + for task in tasks: + if ( + pod_generator.make_safe_label_value(task.dag_id) == dag_id and + pod_generator.make_safe_label_value(task.task_id) == task_id and + task.execution_date == ex_time + ): + self.log.info( + 'Found matching task %s-%s (%s) with current state of %s', + task.dag_id, task.task_id, task.execution_date, task.state + ) + dag_id = task.dag_id + task_id = task.task_id + return (dag_id, task_id, ex_time, try_num) + self.log.warning( + 'Failed to find and match task details to a pod; labels: %s', + labels + ) + return None + + def _flush_watcher_queue(self): + self.log.debug('Executor shutting down, watcher_queue approx. size=%d', self.watcher_queue.qsize()) + while True: + try: + task = self.watcher_queue.get_nowait() + # Ignoring it since it can only have either FAILED or SUCCEEDED pods + self.log.warning('Executor shutting down, IGNORING watcher task=%s', task) + self.watcher_queue.task_done() + except Empty: + break + + def terminate(self): + """Termninates the watcher.""" + self.log.debug("Terminating kube_watcher...") + self.kube_watcher.terminate() + self.kube_watcher.join() + self.log.debug("kube_watcher=%s", self.kube_watcher) + self.log.debug("Flushing watcher_queue...") + self._flush_watcher_queue() + # Queue should be empty... 
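A worked example of the two label-safe datestring helpers above (the timestamp is arbitrary):

    iso = '2020-07-14T12:30:00+00:00'                            # datetime.isoformat() output
    label_safe = iso.replace(':', '_').replace('+', '_plus_')
    # -> '2020-07-14T12_30_00_plus_00_00', free of the ':' and '+' Kubernetes rejects in label values
    restored = label_safe.replace('_plus_', '+').replace('_', ':')
    # -> '2020-07-14T12:30:00+00:00', which dateutil's parser turns back into a datetime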
+ self.watcher_queue.join() + self.log.debug("Shutting down manager...") + self._manager.shutdown() + + +class KubernetesExecutor(BaseExecutor, LoggingMixin): + """Executor for Kubernetes""" + def __init__(self): + self.kube_config = KubeConfig() + self.task_queue = None + self.result_queue = None + self.kube_scheduler = None + self.kube_client = None + self.worker_uuid = None + self._manager = multiprocessing.Manager() + super(KubernetesExecutor, self).__init__(parallelism=self.kube_config.parallelism) + + @provide_session + def clear_not_launched_queued_tasks(self, session=None): + """ + If the airflow scheduler restarts with pending "Queued" tasks, the tasks may or + may not + have been launched Thus, on starting up the scheduler let's check every + "Queued" task to + see if it has been launched (ie: if there is a corresponding pod on kubernetes) + + If it has been launched then do nothing, otherwise reset the state to "None" so + the task + will be rescheduled + + This will not be necessary in a future version of airflow in which there is + proper support + for State.LAUNCHED + """ + queued_tasks = session \ + .query(TaskInstance) \ + .filter(TaskInstance.state == State.QUEUED).all() + self.log.info( + 'When executor started up, found %s queued task instances', + len(queued_tasks) + ) + + for task in queued_tasks: + # noinspection PyProtectedMember + # pylint: disable=protected-access + dict_string = ( + "dag_id={},task_id={},execution_date={},airflow-worker={}".format( + pod_generator.make_safe_label_value(task.dag_id), + pod_generator.make_safe_label_value(task.task_id), + AirflowKubernetesScheduler._datetime_to_label_safe_datestring( + task.execution_date + ), + self.worker_uuid + ) + ) + # pylint: enable=protected-access + kwargs = dict(label_selector=dict_string) + if self.kube_config.kube_client_request_args: + for key, value in self.kube_config.kube_client_request_args.items(): + kwargs[key] = value + pod_list = self.kube_client.list_namespaced_pod( + self.kube_config.kube_namespace, **kwargs) + if not pod_list.items: + self.log.info( + 'TaskInstance: %s found in queued state but was not launched, ' + 'rescheduling', task + ) + session.query(TaskInstance).filter( + TaskInstance.dag_id == task.dag_id, + TaskInstance.task_id == task.task_id, + TaskInstance.execution_date == task.execution_date + ).update({TaskInstance.state: State.NONE}) + + def _inject_secrets(self): + def _create_or_update_secret(secret_name, secret_path): + try: + return self.kube_client.create_namespaced_secret( + self.kube_config.executor_namespace, kubernetes.client.V1Secret( + data={ + 'key.json': base64.b64encode(open(secret_path, 'r').read())}, + metadata=kubernetes.client.V1ObjectMeta(name=secret_name)), + **self.kube_config.kube_client_request_args) + except ApiException as e: + if e.status == 409: + return self.kube_client.replace_namespaced_secret( + secret_name, self.kube_config.executor_namespace, + kubernetes.client.V1Secret( + data={'key.json': base64.b64encode( + open(secret_path, 'r').read())}, + metadata=kubernetes.client.V1ObjectMeta(name=secret_name)), + **self.kube_config.kube_client_request_args) + self.log.exception( + 'Exception while trying to inject secret. ' + 'Secret name: %s, error details: %s', + secret_name, e + ) + raise + + # For each GCP service account key, inject it as a secret in executor + # namespace with the specific secret name configured in the airflow.cfg. + # We let exceptions to pass through to users. 
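The secret-injection code that follows assumes gcp_service_account_keys is a comma-separated list of name=path pairs; an illustrative value and the parsed result:

    keys = 'secret-a=/keys/a.json,secret-b=/keys/b.json'   # hypothetical airflow.cfg value
    pairs = [{'name': spec.strip().split('=')[0], 'path': spec.strip().split('=')[1]}
             for spec in keys.split(',')]
    # -> [{'name': 'secret-a', 'path': '/keys/a.json'}, {'name': 'secret-b', 'path': '/keys/b.json'}]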
+ if self.kube_config.gcp_service_account_keys: + name_path_pair_list = [ + {'name': account_spec.strip().split('=')[0], + 'path': account_spec.strip().split('=')[1]} + for account_spec in self.kube_config.gcp_service_account_keys.split(',')] + for service_account in name_path_pair_list: + _create_or_update_secret(service_account['name'], service_account['path']) + + def start(self): + """Starts the executor""" + self.log.info('Start Kubernetes executor') + self.worker_uuid = KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid() + self.log.debug('Start with worker_uuid: %s', self.worker_uuid) + # always need to reset resource version since we don't know + # when we last started, note for behavior below + # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs + # /CoreV1Api.md#list_namespaced_pod + KubeResourceVersion.reset_resource_version() + self.task_queue = self._manager.Queue() + self.result_queue = self._manager.Queue() + self.kube_client = get_kube_client() + self.kube_scheduler = AirflowKubernetesScheduler( + self.kube_config, self.task_queue, self.result_queue, + self.kube_client, self.worker_uuid + ) + self._inject_secrets() + self.clear_not_launched_queued_tasks() + + def execute_async(self, key, command, queue=None, executor_config=None): + """Executes task asynchronously""" + self.log.info( + 'Add task %s with command %s with executor_config %s', + key, command, executor_config + ) + + kube_executor_config = PodGenerator.from_obj(executor_config) + self.task_queue.put((key, command, kube_executor_config)) + + def sync(self): + """Synchronize task state.""" + if self.running: + self.log.debug('self.running: %s', self.running) + if self.queued_tasks: + self.log.debug('self.queued: %s', self.queued_tasks) + self.kube_scheduler.sync() + + last_resource_version = None + while True: + try: + results = self.result_queue.get_nowait() + try: + key, state, pod_id, namespace, resource_version = results + last_resource_version = resource_version + self.log.info('Changing state of %s to %s', results, state) + try: + self._change_state(key, state, pod_id, namespace) + except Exception as e: + self.log.exception('Exception: %s when attempting ' + + 'to change state of %s to %s, re-queueing.', e, results, state) + self.result_queue.put(results) + finally: + self.result_queue.task_done() + except Empty: + break + + KubeResourceVersion.checkpoint_resource_version(last_resource_version) + + for _ in range(self.kube_config.worker_pods_creation_batch_size): + try: + task = self.task_queue.get_nowait() + try: + self.kube_scheduler.run_next(task) + except ApiException as e: + self.log.warning('ApiException when attempting to run task, re-queueing. ' + 'Message: %s', json.loads(e.body)['message']) + self.task_queue.put(task) + except HTTPError as e: + self.log.warning('HTTPError when attempting to run task, re-queueing. 
' + 'Exception: %s', str(e)) + self.task_queue.put(task) + finally: + self.task_queue.task_done() + except Empty: + break + + def _change_state(self, key, state, pod_id, namespace): + if state != State.RUNNING: + if self.kube_config.delete_worker_pods: + if not self.kube_scheduler: + raise AirflowException("The executor should be started first!") + if state is not State.FAILED or self.kube_config.delete_worker_pods_on_failure: + self.kube_scheduler.delete_pod(pod_id, namespace) + self.log.info('Deleted pod: %s in namespace %s', str(key), str(namespace)) + try: + self.running.pop(key) + except KeyError: + self.log.debug('Could not find key: %s', str(key)) + self.event_buffer[key] = state + + def _flush_task_queue(self): + self.log.debug('Executor shutting down, task_queue approximate size=%d', self.task_queue.qsize()) + while True: + try: + task = self.task_queue.get_nowait() + # This is a new task to run thus ok to ignore. + self.log.warning('Executor shutting down, will NOT run task=%s', task) + self.task_queue.task_done() + except Empty: + break + + def _flush_result_queue(self): + self.log.debug('Executor shutting down, result_queue approximate size=%d', self.result_queue.qsize()) + while True: # pylint: disable=too-many-nested-blocks + try: + results = self.result_queue.get_nowait() + self.log.warning('Executor shutting down, flushing results=%s', results) + try: + key, state, pod_id, namespace, resource_version = results + self.log.info('Changing state of %s to %s : resource_version=%d', results, state, + resource_version) + try: + self._change_state(key, state, pod_id, namespace) + except Exception as e: # pylint: disable=broad-except + self.log.exception('Ignoring exception: %s when attempting to change state of %s ' + 'to %s.', e, results, state) + finally: + self.result_queue.task_done() + except Empty: + break + + def end(self): + """Called when the executor shuts down""" + self.log.info('Shutting down Kubernetes executor') + self.log.debug('Flushing task_queue...') + self._flush_task_queue() + self.log.debug('Flushing result_queue...') + self._flush_result_queue() + # Both queues should be empty... + self.task_queue.join() + self.result_queue.join() + if self.kube_scheduler: + self.kube_scheduler.terminate() + self._manager.shutdown() diff --git a/airflow/executors/local_executor.py b/airflow/executors/local_executor.py index 2c3ba4046c9c0..b4a0fba678f65 100644 --- a/airflow/executors/local_executor.py +++ b/airflow/executors/local_executor.py @@ -17,31 +17,11 @@ # specific language governing permissions and limitations # under the License. """ -LocalExecutor runs tasks by spawning processes in a controlled fashion in different -modes. Given that BaseExecutor has the option to receive a `parallelism` parameter to -limit the number of process spawned, when this parameter is `0` the number of processes -that LocalExecutor can spawn is unlimited. - -The following strategies are implemented: -1. Unlimited Parallelism (self.parallelism == 0): In this strategy, LocalExecutor will -spawn a process every time `execute_async` is called, that is, every task submitted to the -LocalExecutor will be executed in its own process. Once the task is executed and the -result stored in the `result_queue`, the process terminates. There is no need for a -`task_queue` in this approach, since as soon as a task is received a new process will be -allocated to the task. Processes used in this strategy are of class LocalWorker. - -2. 
Limited Parallelism (self.parallelism > 0): In this strategy, the LocalExecutor spawns -the number of processes equal to the value of `self.parallelism` at `start` time, -using a `task_queue` to coordinate the ingestion of tasks and the work distribution among -the workers, which will take a task as soon as they are ready. During the lifecycle of -the LocalExecutor, the worker processes are running waiting for tasks, once the -LocalExecutor receives the call to shutdown the executor a poison token is sent to the -workers to terminate them. Processes used in this strategy are of class QueuedLocalWorker. - -Arguably, `SequentialExecutor` could be thought as a LocalExecutor with limited -parallelism of just 1 worker, i.e. `self.parallelism = 1`. -This option could lead to the unification of the executor implementations, running -locally, into just one `LocalExecutor` with multiple modes. +LocalExecutor + +.. seealso:: + For more information on how the LocalExecutor works, take a look at the guide: + :ref:`executor:LocalExecutor` """ import multiprocessing @@ -225,6 +205,8 @@ def start(self): self.impl.start() def execute_async(self, key, command, queue=None, executor_config=None): + if command[0:2] != ["airflow", "run"]: + raise ValueError('The command must start with ["airflow", "run"].') self.impl.execute_async(key=key, command=command) def sync(self): diff --git a/airflow/executors/sequential_executor.py b/airflow/executors/sequential_executor.py index 1542e3318eb28..cb5717c9d9001 100644 --- a/airflow/executors/sequential_executor.py +++ b/airflow/executors/sequential_executor.py @@ -17,6 +17,13 @@ # specific language governing permissions and limitations # under the License. +""" +SequentialExecutor + +.. seealso:: + For more information on how the SequentialExecutor works, take a look at the guide: + :ref:`executor:SequentialExecutor` +""" from builtins import str import subprocess @@ -38,6 +45,8 @@ def __init__(self): self.commands_to_run = [] def execute_async(self, key, command, queue=None, executor_config=None): + if command[0:2] != ["airflow", "run"]: + raise ValueError('The command must start with ["airflow", "run"].') self.commands_to_run.append((key, command,)) def sync(self): diff --git a/airflow/hooks/S3_hook.py b/airflow/hooks/S3_hook.py index 1ba69c6c843f7..cce8e5b060742 100644 --- a/airflow/hooks/S3_hook.py +++ b/airflow/hooks/S3_hook.py @@ -361,8 +361,9 @@ def load_file(self, if encrypt: extra_args['ServerSideEncryption'] = "AES256" if gzip: - filename_gz = filename.name + '.gz' - with open(filename.name, 'rb') as f_in: + filename_gz = '' + with open(filename, 'rb') as f_in: + filename_gz = f_in.name + '.gz' with gz.open(filename_gz, 'wb') as f_out: shutil.copyfileobj(f_in, f_out) filename = filename_gz diff --git a/airflow/hooks/__init__.py b/airflow/hooks/__init__.py index 1f1c40f16ae61..745edf8895104 100644 --- a/airflow/hooks/__init__.py +++ b/airflow/hooks/__init__.py @@ -22,6 +22,8 @@ import sys +PY37 = sys.version_info >= (3, 7) + # ------------------------------------------------------------------------ # # #TODO #FIXME Airflow 2.0 @@ -77,6 +79,10 @@ def _integrate_plugins(): from airflow.plugins_manager import hooks_modules for hooks_module in hooks_modules: sys.modules[hooks_module.__name__] = hooks_module + + if not PY37: + from pep562 import Pep562 + hooks_module = Pep562(hooks_module.__name__) globals()[hooks_module._name] = hooks_module ########################################################## diff --git a/airflow/hooks/base_hook.py 
b/airflow/hooks/base_hook.py index 0d7e39fb62a9c..16a18a8f50dce 100644 --- a/airflow/hooks/base_hook.py +++ b/airflow/hooks/base_hook.py @@ -23,6 +23,7 @@ from __future__ import print_function from __future__ import unicode_literals +import logging import os import random from typing import List @@ -35,6 +36,8 @@ CONN_ENV_PREFIX = 'AIRFLOW_CONN_' +log = logging.getLogger(__name__) + class BaseHook(LoggingMixin): """ @@ -83,7 +86,6 @@ def get_connections(cls, conn_id): # type: (str) -> List[Connection] def get_connection(cls, conn_id): # type: (str) -> Connection conn = random.choice(list(cls.get_connections(conn_id))) if conn.host: - log = LoggingMixin().log log.info("Using connection to: %s", conn.log_info()) return conn diff --git a/airflow/hooks/dbapi_hook.py b/airflow/hooks/dbapi_hook.py index d03d7daa4323f..76f4f0ad63bbf 100644 --- a/airflow/hooks/dbapi_hook.py +++ b/airflow/hooks/dbapi_hook.py @@ -82,7 +82,7 @@ def get_sqlalchemy_engine(self, engine_kwargs=None): engine_kwargs = {} return create_engine(self.get_uri(), **engine_kwargs) - def get_pandas_df(self, sql, parameters=None): + def get_pandas_df(self, sql, parameters=None, **kwargs): """ Executes the sql and returns a pandas dataframe @@ -90,14 +90,16 @@ def get_pandas_df(self, sql, parameters=None): sql statements to execute :type sql: str or list :param parameters: The parameters to render the SQL query with. - :type parameters: mapping or iterable + :type parameters: dict or iterable + :param kwargs: (optional) passed into pandas.io.sql.read_sql method + :type kwargs: dict """ if sys.version_info[0] < 3: sql = sql.encode('utf-8') import pandas.io.sql as psql with closing(self.get_conn()) as conn: - return psql.read_sql(sql, con=conn, params=parameters) + return psql.read_sql(sql, con=conn, params=parameters, **kwargs) def get_records(self, sql, parameters=None): """ @@ -211,8 +213,43 @@ def get_cursor(self): """ return self.get_conn().cursor() + @staticmethod + def _generate_insert_sql(table, values, target_fields, replace, **kwargs): + """ + Static helper method that generate the INSERT SQL statement. + The REPLACE variant is specific to MySQL syntax. 
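For reference, a sketch of what the new helper produces, assuming a hypothetical table and columns (the call is shown on the class because the method is static):

    from airflow.hooks.dbapi_hook import DbApiHook

    sql = DbApiHook._generate_insert_sql('tab', ('x', 'y'), ['a', 'b'], False)
    # -> "INSERT INTO tab (a, b) VALUES (%s,%s)"
    sql = DbApiHook._generate_insert_sql('tab', ('x', 'y'), ['a', 'b'], True)
    # -> "REPLACE INTO tab (a, b) VALUES (%s,%s)"  (the MySQL-style REPLACE variant)

insert_rows() now builds its statement through this helper and forwards any extra keyword arguments to it.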
+ + :param table: Name of the target table + :type table: str + :param values: The row to insert into the table + :type values: tuple of cell values + :param target_fields: The names of the columns to fill in the table + :type target_fields: iterable of strings + :param replace: Whether to replace instead of insert + :type replace: bool + :return: The generated INSERT or REPLACE SQL statement + :rtype: str + """ + placeholders = ["%s", ] * len(values) + + if target_fields: + target_fields = ", ".join(target_fields) + target_fields = "({})".format(target_fields) + else: + target_fields = '' + + if not replace: + sql = "INSERT INTO " + else: + sql = "REPLACE INTO " + sql += "{0} {1} VALUES ({2})".format( + table, + target_fields, + ",".join(placeholders)) + return sql + def insert_rows(self, table, rows, target_fields=None, commit_every=1000, - replace=False): + replace=False, **kwargs): """ A generic way to insert a set of tuples into a table, a new transaction is created every commit_every rows @@ -229,11 +266,6 @@ def insert_rows(self, table, rows, target_fields=None, commit_every=1000, :param replace: Whether to replace instead of insert :type replace: bool """ - if target_fields: - target_fields = ", ".join(target_fields) - target_fields = "({})".format(target_fields) - else: - target_fields = '' i = 0 with closing(self.get_conn()) as conn: if self.supports_autocommit: @@ -247,20 +279,14 @@ def insert_rows(self, table, rows, target_fields=None, commit_every=1000, for cell in row: lst.append(self._serialize_cell(cell, conn)) values = tuple(lst) - placeholders = ["%s", ] * len(values) - if not replace: - sql = "INSERT INTO " - else: - sql = "REPLACE INTO " - sql += "{0} {1} VALUES ({2})".format( - table, - target_fields, - ",".join(placeholders)) + sql = self._generate_insert_sql( + table, values, target_fields, replace, **kwargs + ) cur.execute(sql, values) if commit_every and i % commit_every == 0: conn.commit() self.log.info( - "Loaded %s into %s rows so far", i, table + "Loaded %s rows into %s so far", i, table ) conn.commit() diff --git a/airflow/hooks/hive_hooks.py b/airflow/hooks/hive_hooks.py index ccab93dfad129..48def1164c15a 100644 --- a/airflow/hooks/hive_hooks.py +++ b/airflow/hooks/hive_hooks.py @@ -19,8 +19,10 @@ from __future__ import print_function, unicode_literals +import collections import contextlib import os +import random import re import subprocess import time @@ -181,7 +183,7 @@ def _prepare_hiveconf(d): return [] return as_flattened_list( zip(["-hiveconf"] * len(d), - ["{}={}".format(k, v) for k, v in d.items()]) + ["{}={}".format(k, v) for k, v in collections.OrderedDict(sorted(d.items())).items()]) ) def run_cli(self, hql, schema=None, verbose=True, hive_conf=None): @@ -549,6 +551,7 @@ def sasl_factory(): def _find_valid_server(self): conns = self.get_connections(self.conn_id) + random.shuffle(conns) for conn in conns: host_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.log.info("Trying to connect to %s:%s", conn.host, conn.port) @@ -690,6 +693,7 @@ def _get_max_partition_from_part_specs(part_specs, partition_key, filter_map): pairs will be considered as candidates of max partition. :type filter_map: map :return: Max partition or None if part_specs is empty. 
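A quick illustration of the _prepare_hiveconf ordering change above; sorting the dict makes the generated CLI flags deterministic (the settings are made up, and the comprehension stands in for the as_flattened_list(zip(...)) call):

    import collections

    hive_conf = {'mapred.job.queue.name': 'default', 'airflow.ctx.dag_id': 'my_dag'}
    flags = [item
             for k, v in collections.OrderedDict(sorted(hive_conf.items())).items()
             for item in ('-hiveconf', '{}={}'.format(k, v))]
    # -> ['-hiveconf', 'airflow.ctx.dag_id=my_dag', '-hiveconf', 'mapred.job.queue.name=default']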
+ :rtype: basestring """ if not part_specs: return None @@ -713,7 +717,7 @@ def _get_max_partition_from_part_specs(part_specs, partition_key, filter_map): if not candidates: return None else: - return max(candidates).encode('utf-8') + return max(candidates) def max_partition(self, schema, table_name, field=None, filter_map=None): """ @@ -825,6 +829,7 @@ def get_conn(self, schema=None): auth=auth_mechanism, kerberos_service_name=kerberos_service_name, username=db.login or username, + password=db.password, database=schema or db.schema or 'default') def _get_results(self, hql, schema='default', fetch_size=None, hive_conf=None): @@ -978,7 +983,7 @@ def get_records(self, hql, schema='default'): """ return self.get_results(hql, schema=schema)['data'] - def get_pandas_df(self, hql, schema='default'): + def get_pandas_df(self, hql, schema='default', **kwargs): """ Get a pandas dataframe from a Hive query @@ -986,6 +991,8 @@ def get_pandas_df(self, hql, schema='default'): :type hql: str or list :param schema: target schema, default to 'default'. :type schema: str + :param kwargs: (optional) passed into pandas.DataFrame constructor + :type kwargs: dict :return: result of hql execution :rtype: DataFrame @@ -999,6 +1006,6 @@ def get_pandas_df(self, hql, schema='default'): """ import pandas as pd res = self.get_results(hql, schema=schema) - df = pd.DataFrame(res['data']) + df = pd.DataFrame(res['data'], **kwargs) df.columns = [c[0] for c in res['header']] return df diff --git a/airflow/hooks/postgres_hook.py b/airflow/hooks/postgres_hook.py index a6d6523d3e970..4c7a324983bcc 100644 --- a/airflow/hooks/postgres_hook.py +++ b/airflow/hooks/postgres_hook.py @@ -177,3 +177,57 @@ def get_iam_token(self, conn): client = aws_hook.get_client_type('rds') token = client.generate_db_auth_token(conn.host, port, conn.login) return login, token, port + + @staticmethod + def _generate_insert_sql(table, values, target_fields, replace, **kwargs): + """ + Static helper method that generate the INSERT SQL statement. + The REPLACE variant is specific to MySQL syntax. 
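+        For PostgreSQL the "replace" case is rendered as an ``ON CONFLICT ... DO UPDATE``
+        upsert instead; for example (illustrative arguments only),
+        ``_generate_insert_sql("tab", (1, "a"), ["id", "name"], True, replace_index="id")``
+        would return ``INSERT INTO tab (id, name) VALUES (%s,%s) ON CONFLICT (id) DO UPDATE SET name = excluded.name``.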
+ + :param table: Name of the target table + :type table: str + :param values: The row to insert into the table + :type values: tuple of cell values + :param target_fields: The names of the columns to fill in the table + :type target_fields: iterable of strings + :param replace: Whether to replace instead of insert + :type replace: bool + :param replace_index: the column or list of column names to act as + index for the ON CONFLICT clause + :type replace_index: str or list + :return: The generated INSERT or REPLACE SQL statement + :rtype: str + """ + placeholders = ["%s", ] * len(values) + replace_index = kwargs.get("replace_index", None) + + if target_fields: + target_fields_fragment = ", ".join(target_fields) + target_fields_fragment = "({})".format(target_fields_fragment) + else: + target_fields_fragment = '' + + sql = "INSERT INTO {0} {1} VALUES ({2})".format( + table, + target_fields_fragment, + ",".join(placeholders)) + + if replace: + if target_fields is None: + raise ValueError("PostgreSQL ON CONFLICT upsert syntax requires column names") + if replace_index is None: + raise ValueError("PostgreSQL ON CONFLICT upsert syntax requires an unique index") + if isinstance(replace_index, str): + replace_index = [replace_index] + replace_index_set = set(replace_index) + + replace_target = [ + "{0} = excluded.{0}".format(col) + for col in target_fields + if col not in replace_index_set + ] + sql += " ON CONFLICT ({0}) DO UPDATE SET {1}".format( + ", ".join(replace_index), + ", ".join(replace_target), + ) + return sql diff --git a/airflow/hooks/presto_hook.py b/airflow/hooks/presto_hook.py index 9788411b97d7a..7d700aba8217b 100644 --- a/airflow/hooks/presto_hook.py +++ b/airflow/hooks/presto_hook.py @@ -105,7 +105,7 @@ def get_first(self, hql, parameters=None): except DatabaseError as e: raise PrestoException(self._get_pretty_exception_message(e)) - def get_pandas_df(self, hql, parameters=None): + def get_pandas_df(self, hql, parameters=None, **kwargs): """ Get a pandas dataframe from a sql query. """ @@ -118,10 +118,10 @@ def get_pandas_df(self, hql, parameters=None): raise PrestoException(self._get_pretty_exception_message(e)) column_descriptions = cursor.description if data: - df = pandas.DataFrame(data) + df = pandas.DataFrame(data, **kwargs) df.columns = [c[0] for c in column_descriptions] else: - df = pandas.DataFrame() + df = pandas.DataFrame(**kwargs) return df def run(self, hql, parameters=None): diff --git a/airflow/hooks/webhdfs_hook.py b/airflow/hooks/webhdfs_hook.py index 6d260eeda4689..262f61a597885 100644 --- a/airflow/hooks/webhdfs_hook.py +++ b/airflow/hooks/webhdfs_hook.py @@ -16,20 +16,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import logging from hdfs import InsecureClient, HdfsError from airflow.configuration import conf from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -from airflow.utils.log.logging_mixin import LoggingMixin + + +log = logging.getLogger(__name__) _kerberos_security_mode = conf.get("core", "security") == "kerberos" if _kerberos_security_mode: try: from hdfs.ext.kerberos import KerberosClient except ImportError: - log = LoggingMixin().log log.error("Could not load the Kerberos extension for the WebHDFSHook.") raise diff --git a/airflow/jobs/backfill_job.py b/airflow/jobs/backfill_job.py index 29e426be4c530..b1e0755300ff8 100644 --- a/airflow/jobs/backfill_job.py +++ b/airflow/jobs/backfill_job.py @@ -556,9 +556,8 @@ def _per_task_process(task, key, ti, session=None): open_slots = pool.open_slots(session=session) if open_slots <= 0: raise NoAvailablePoolSlot( - "Not scheduling since there are " - "%s open slots in pool %s".format( - open_slots, task.pool)) + "Not scheduling since there are " # noqa: F523 + "%s open slots in pool %s".format(open_slots, task.pool)) # noqa: F523 num_running_task_instances_in_dag = DAG.get_num_task_instances( self.dag_id, diff --git a/airflow/jobs/base_job.py b/airflow/jobs/base_job.py index 7c0a8471110ed..94f59697568f9 100644 --- a/airflow/jobs/base_job.py +++ b/airflow/jobs/base_job.py @@ -84,7 +84,7 @@ def __init__( *args, **kwargs): self.hostname = get_hostname() self.executor = executor or executors.get_default_executor() - self.executor_class = executor.__class__.__name__ + self.executor_class = self.executor.__class__.__name__ self.start_date = timezone.utcnow() self.latest_heartbeat = timezone.utcnow() if heartrate is not None: @@ -158,7 +158,7 @@ def heartbeat(self): This also allows for any job to be killed externally, regardless of who is running it or on which machine it is running. - Note that if your heartbeat is set to 60 seconds and you call this + Note that if your heart rate is set to 60 seconds and you call this method after 10 seconds of processing since the last heartbeat, it will sleep 50 seconds to complete the 60 seconds and keep a steady heart rate. If you go over 60 seconds before calling it, it won't @@ -175,17 +175,14 @@ def heartbeat(self): if self.state == State.SHUTDOWN: self.kill() - is_unit_test = conf.getboolean('core', 'unit_test_mode') - if not is_unit_test: - # Figure out how long to sleep for - sleep_for = 0 - if self.latest_heartbeat: - seconds_remaining = self.heartrate - \ - (timezone.utcnow() - self.latest_heartbeat)\ - .total_seconds() - sleep_for = max(0, seconds_remaining) - - sleep(sleep_for) + # Figure out how long to sleep for + sleep_for = 0 + if self.latest_heartbeat: + seconds_remaining = self.heartrate - \ + (timezone.utcnow() - self.latest_heartbeat)\ + .total_seconds() + sleep_for = max(0, seconds_remaining) + sleep(sleep_for) # Update last heartbeat time with create_session() as session: diff --git a/airflow/jobs/local_task_job.py b/airflow/jobs/local_task_job.py index 353bf03891e64..9398af7354d53 100644 --- a/airflow/jobs/local_task_job.py +++ b/airflow/jobs/local_task_job.py @@ -24,7 +24,6 @@ import os import signal -import time from airflow.configuration import conf from airflow.exceptions import AirflowException @@ -116,13 +115,6 @@ def signal_handler(signum, frame): "exceeded limit ({}s)." 
.format(time_since_last_heartbeat, heartbeat_time_limit)) - - if time_since_last_heartbeat < self.heartrate: - sleep_for = self.heartrate - time_since_last_heartbeat - self.log.debug("Time since last heartbeat(%.2f s) < heartrate(%s s)" - ", sleeping for %s s", time_since_last_heartbeat, - self.heartrate, sleep_for) - time.sleep(sleep_for) finally: self.on_kill() diff --git a/airflow/jobs/scheduler_job.py b/airflow/jobs/scheduler_job.py index 6e572a5a437bc..a10c80cc01d33 100644 --- a/airflow/jobs/scheduler_job.py +++ b/airflow/jobs/scheduler_job.py @@ -45,7 +45,7 @@ from airflow.jobs.base_job import BaseJob from airflow.models import DagRun, SlaMiss, errors from airflow.settings import Stats -from airflow.ti_deps.dep_context import DepContext, SCHEDULEABLE_STATES, SCHEDULED_DEPS +from airflow.ti_deps.dep_context import DepContext, SCHEDULED_DEPS from airflow.operators.dummy_operator import DummyOperator from airflow.ti_deps.deps.pool_slots_available_dep import STATES_TO_COUNT_AS_RUNNING from airflow.utils import asciiart, helpers, timezone @@ -57,19 +57,20 @@ list_py_file_paths) from airflow.utils.db import provide_session from airflow.utils.email import get_email_address_list, send_email +from airflow.utils.mixins import MultiprocessingStartMethodMixin from airflow.utils.log.logging_mixin import LoggingMixin, StreamLogWriter, set_context from airflow.utils.state import State -class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin): +class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin, MultiprocessingStartMethodMixin): """Helps call SchedulerJob.process_file() in a separate process. :param file_path: a Python file containing Airflow DAG definitions :type file_path: unicode :param pickle_dags: whether to serialize the DAG objects to the DB :type pickle_dags: bool - :param dag_id_white_list: If specified, only look at these DAG ID's - :type dag_id_white_list: list[unicode] + :param dag_ids: If specified, only look at these DAG ID's + :type dag_ids: list[unicode] :param zombies: zombie task instances to kill :type zombies: list[airflow.utils.dag_processing.SimpleTaskInstance] """ @@ -77,12 +78,12 @@ class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin): # Counter that increments every time an instance of this class is created class_creation_counter = 0 - def __init__(self, file_path, pickle_dags, dag_id_white_list, zombies): + def __init__(self, file_path, pickle_dags, dag_ids, zombies): self._file_path = file_path # The process that was launched to process the given . self._process = None - self._dag_id_white_list = dag_id_white_list + self._dag_ids = dag_ids self._pickle_dags = pickle_dags self._zombies = zombies # The result of Scheduler.process_file(file_path). 
@@ -104,7 +105,7 @@ def file_path(self): def _run_file_processor(result_channel, file_path, pickle_dags, - dag_id_white_list, + dag_ids, thread_name, zombies): """ @@ -117,9 +118,9 @@ def _run_file_processor(result_channel, :param pickle_dags: whether to pickle the DAGs found in the file and save them to the DB :type pickle_dags: bool - :param dag_id_white_list: if specified, only examine DAG ID's that are + :param dag_ids: if specified, only examine DAG ID's that are in this list - :type dag_id_white_list: list[unicode] + :type dag_ids: list[unicode] :param thread_name: the name to use for the process that is launched :type thread_name: unicode :param zombies: zombie task instances to kill @@ -152,7 +153,7 @@ def _run_file_processor(result_channel, log.info("Started process (PID=%s) to work on %s", os.getpid(), file_path) - scheduler_job = SchedulerJob(dag_ids=dag_id_white_list, log=log) + scheduler_job = SchedulerJob(dag_ids=dag_ids, log=log) result = scheduler_job.process_file(file_path, zombies, pickle_dags) @@ -177,14 +178,20 @@ def start(self): """ Launch the process and start processing the DAG. """ - self._parent_channel, _child_channel = multiprocessing.Pipe() - self._process = multiprocessing.Process( + if six.PY2: + context = multiprocessing + else: + start_method = self._get_multiprocessing_start_method() + context = multiprocessing.get_context(start_method) + + self._parent_channel, _child_channel = context.Pipe() + self._process = context.Process( target=type(self)._run_file_processor, args=( _child_channel, self.file_path, self._pickle_dags, - self._dag_id_white_list, + self._dag_ids, "DagFileProcessor{}".format(self._instance_id), self._zombies ), @@ -215,7 +222,12 @@ def terminate(self, sigkill=False): self._process.terminate() # Arbitrarily wait 5s for the process to die - self._process.join(5) + if six.PY2: + self._process.join(5) + else: + from contextlib import suppress + with suppress(TimeoutError): + self._process._popen.wait(5) # pylint: disable=protected-access if sigkill: self._kill_process() self._parent_channel.close() @@ -379,7 +391,7 @@ def __init__( self.do_pickle = do_pickle super(SchedulerJob, self).__init__(*args, **kwargs) - self.max_threads = conf.getint('scheduler', 'max_threads') + self.max_threads = conf.getint('scheduler', 'parsing_processes') if log: self._log = log @@ -450,7 +462,7 @@ def manage_slas(self, dag, session=None): # This is a temporary fix for 1.10.4 release. # Background: AIRFLOW-4297 # TODO: refactor manage_slas() to handle related issues. 
- if dag._schedule_interval is None: + if dag.normalized_schedule_interval is None: self.log.info("SLA check for DAGs with schedule_interval 'None'/'@once' are " "skipped in 1.10.4, due to related refactoring going on.") return @@ -668,7 +680,7 @@ def create_dag_run(self, dag, session=None): now = timezone.utcnow() next_start = dag.following_schedule(now) last_start = dag.previous_schedule(now) - if next_start <= now: + if next_start <= now or isinstance(dag.schedule_interval, timedelta): new_start = last_start else: new_start = dag.previous_schedule(last_start) @@ -778,27 +790,11 @@ def _process_task_instances(self, dag, task_instances_list, session=None): run.dag = dag # todo: preferably the integrity check happens at dag collection time run.verify_integrity(session=session) - run.update_state(session=session) + ready_tis = run.update_state(session=session) if run.state == State.RUNNING: - make_transient(run) active_dag_runs.append(run) - - for run in active_dag_runs: - self.log.debug("Examining active DAG run: %s", run) - tis = run.get_task_instances(state=SCHEDULEABLE_STATES) - - # this loop is quite slow as it uses are_dependencies_met for - # every task (in ti.is_runnable). This is also called in - # update_state above which has already checked these tasks - for ti in tis: - task = dag.get_task(ti.task_id) - - # fixme: ti.task is transient but needs to be set - ti.task = task - - if ti.are_dependencies_met( - dep_context=DepContext(flag_upstream_failed=True), - session=session): + self.log.debug("Examining active DAG run: %s", run) + for ti in ready_tis: self.log.debug('Queuing task: %s', ti) task_instances_list.append(ti.key) @@ -962,6 +958,9 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): dag_concurrency_map, task_concurrency_map = self.__get_concurrency_maps( states=STATES_TO_COUNT_AS_RUNNING, session=session) + num_tasks_in_executor = 0 + num_starving_tasks_total = 0 + # Go through each pool, and queue up a task for execution if there are # any open slots in the pool. for pool, task_instances in pool_to_task_instances.items(): @@ -985,9 +984,7 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): priority_sorted_task_instances = sorted( task_instances, key=lambda ti: (-ti.priority_weight, ti.execution_date)) - # Number of tasks that cannot be scheduled because of no open slot in pool num_starving_tasks = 0 - num_tasks_in_executor = 0 for current_index, task_instance in enumerate(priority_sorted_task_instances): if open_slots <= 0: self.log.info( @@ -995,7 +992,9 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): open_slots, pool ) # Can't schedule any more since there are no more open slots. 
- num_starving_tasks = len(priority_sorted_task_instances) - current_index + num_unhandled = len(priority_sorted_task_instances) - current_index + num_starving_tasks += num_unhandled + num_starving_tasks_total += num_unhandled break # Check to make sure that the task concurrency of the DAG hasn't been @@ -1038,8 +1037,17 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): num_tasks_in_executor += 1 continue + if task_instance.pool_slots > open_slots: + self.log.info("Not executing %s since it requires %s slots " + "but there are %s open slots in the pool %s.", + task_instance, task_instance.pool_slots, open_slots, pool) + num_starving_tasks += 1 + num_starving_tasks_total += 1 + # Though we can execute tasks with lower priority if there's enough room + continue + executable_tis.append(task_instance) - open_slots -= 1 + open_slots -= task_instance.pool_slots dag_concurrency_map[dag_id] += 1 task_concurrency_map[(task_instance.dag_id, task_instance.task_id)] += 1 @@ -1049,10 +1057,11 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): pools[pool_name].open_slots()) Stats.gauge('pool.used_slots.{pool_name}'.format(pool_name=pool_name), pools[pool_name].occupied_slots()) - Stats.gauge('scheduler.tasks.pending', len(task_instances_to_examine)) - Stats.gauge('scheduler.tasks.running', num_tasks_in_executor) - Stats.gauge('scheduler.tasks.starving', num_starving_tasks) - Stats.gauge('scheduler.tasks.executable', len(executable_tis)) + + Stats.gauge('scheduler.tasks.pending', len(task_instances_to_examine)) + Stats.gauge('scheduler.tasks.running', num_tasks_in_executor) + Stats.gauge('scheduler.tasks.starving', num_starving_tasks_total) + Stats.gauge('scheduler.tasks.executable', len(executable_tis)) task_instance_str = "\n\t".join( [repr(x) for x in executable_tis]) @@ -1266,7 +1275,7 @@ def _process_dags(self, dagbag, dags, tis_out): :param dagbag: a collection of DAGs to process :type dagbag: airflow.models.DagBag :param dags: the DAGs from the DagBag to process - :type dags: airflow.models.DAG + :type dags: list[airflow.models.DAG] :param tis_out: A list to add generated TaskInstance objects :type tis_out: list[TaskInstance] :rtype: None @@ -1277,10 +1286,6 @@ def _process_dags(self, dagbag, dags, tis_out): self.log.error("DAG ID %s was not found in the DagBag", dag.dag_id) continue - if dag.is_paused: - self.log.info("Not processing DAG %s since it's paused", dag.dag_id) - continue - self.log.info("Processing %s", dag.dag_id) dag_run = self.create_dag_run(dag) @@ -1359,12 +1364,6 @@ def _execute(self): known_file_paths = list_py_file_paths(self.subdir) self.log.info("There are %s files in %s", len(known_file_paths), self.subdir) - def processor_factory(file_path, zombies): - return DagFileProcessor(file_path, - pickle_dags, - self.dag_ids, - zombies) - # When using sqlite, we do not use async_mode # so the scheduler job and DAG parser don't access the DB at the same time. 
async_mode = not self.using_sqlite @@ -1374,8 +1373,10 @@ def processor_factory(file_path, zombies): self.processor_agent = DagFileProcessorAgent(self.subdir, known_file_paths, self.num_runs, - processor_factory, + type(self)._create_dag_file_processor, processor_timeout, + self.dag_ids, + pickle_dags, async_mode) try: @@ -1386,6 +1387,16 @@ def processor_factory(file_path, zombies): self.processor_agent.end() self.log.info("Exited execute loop") + @staticmethod + def _create_dag_file_processor(file_path, zombies, dag_ids, pickle_dags): + """ + Creates DagFileProcessorProcess instance. + """ + return DagFileProcessor(file_path, + pickle_dags, + dag_ids, + zombies) + def _get_simple_dags(self): return self.processor_agent.harvest_simple_dags() @@ -1581,8 +1592,7 @@ def process_file(self, file_path, zombies, pickle_dags=False, session=None): for dag in dagbag.dags.values(): dag.sync_to_db() - paused_dag_ids = [dag.dag_id for dag in dagbag.dags.values() - if dag.is_paused] + paused_dag_ids = models.DagModel.get_paused_dag_ids(dag_ids=dagbag.dag_ids) # Pickle the DAGs (if necessary) and put them into a SimpleDag for dag_id in dagbag.dags: @@ -1635,6 +1645,8 @@ def process_file(self, file_path, zombies, pickle_dags=False, session=None): if isinstance(ti.task, DummyOperator) \ and not ti.task.on_success_callback: ti.state = State.SUCCESS + ti.start_date = ti.end_date = timezone.utcnow() + ti.duration = 0 # Also save this task instance to the DB. self.log.info("Creating / updating %s in ORM", ti) diff --git a/airflow/contrib/kubernetes/kubernetes_request_factory/__init__.py b/airflow/kubernetes/__init__.py similarity index 100% rename from airflow/contrib/kubernetes/kubernetes_request_factory/__init__.py rename to airflow/kubernetes/__init__.py diff --git a/airflow/kubernetes/k8s_model.py b/airflow/kubernetes/k8s_model.py new file mode 100644 index 0000000000000..e10a9465d6d45 --- /dev/null +++ b/airflow/kubernetes/k8s_model.py @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Classes for interacting with Kubernetes API +""" + +import abc +import sys +from functools import reduce + +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + +class K8SModel(ABC): + + """ + These Airflow Kubernetes models are here for backwards compatibility + reasons only. Ideally clients should use the kubernetes api + and the process of + + client input -> Airflow k8s models -> k8s models + + can be avoided. All of these models implement the + `attach_to_pod` method so that they integrate with the kubernetes client. 
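+
+    A minimal sketch of that flow (``Port`` is one such model, defined in
+    ``airflow.kubernetes.pod``; the bare pod below is hypothetical)::
+
+        from kubernetes.client import models as k8s
+        pod = k8s.V1Pod(spec=k8s.V1PodSpec(containers=[k8s.V1Container(name='base')]))
+        pod = append_to_pod(pod, [Port(name='http', container_port=8080)])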
+ """ + + @abc.abstractmethod + def attach_to_pod(self, pod): + """ + :param pod: A pod to attach this Kubernetes object to + :type pod: kubernetes.client.models.V1Pod + :return: The pod with the object attached + """ + + def as_dict(self): + res = {} + if hasattr(self, "__slots__"): + for s in self.__slots__: + if hasattr(self, s): + res[s] = getattr(self, s) + if hasattr(self, "__dict__"): + res_dict = self.__dict__.copy() + res_dict.update(res) + return res_dict + return res + + +def append_to_pod(pod, k8s_objects): + """ + Attach Kubernetes objects to the given POD + + :param pod: A pod to attach a list of Kubernetes objects to + :type pod: kubernetes.client.models.V1Pod + :param k8s_objects: a potential None list of K8SModels + :type k8s_objects: Optional[List[K8SModel]] + :return: pod with the objects attached if they exist + """ + if not k8s_objects: + return pod + new_pod = reduce(lambda p, o: o.attach_to_pod(p), k8s_objects, pod) + return new_pod diff --git a/airflow/kubernetes/kube_client.py b/airflow/kubernetes/kube_client.py new file mode 100644 index 0000000000000..cff817be74d72 --- /dev/null +++ b/airflow/kubernetes/kube_client.py @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Client for kubernetes communication""" + +from typing import Optional + +from airflow.configuration import conf +from six import PY2 + +try: + from kubernetes import config, client + from kubernetes.client.rest import ApiException # pylint: disable=unused-import + from kubernetes.client.api_client import ApiClient + from kubernetes.client import Configuration + from airflow.kubernetes.refresh_config import ( # pylint: disable=ungrouped-imports + load_kube_config, + RefreshConfiguration, + ) + has_kubernetes = True + + def _get_kube_config(in_cluster, # type: bool + cluster_context, # type: Optional[str] + config_file, # type: Optional[str] + ): # type: (...) -> Optional[Configuration] + if in_cluster: + # load_incluster_config set default configuration with config populated by k8s + config.load_incluster_config() + cfg = None + else: + # this block can be replaced with just config.load_kube_config once + # refresh_config module is replaced with upstream fix + cfg = RefreshConfiguration() + load_kube_config( + client_configuration=cfg, config_file=config_file, context=cluster_context) + + if PY2: + # For connect_get_namespaced_pod_exec + configuration = Configuration() + configuration.assert_hostname = False + Configuration.set_default(configuration) + return cfg + + def _get_client_with_patched_configuration(cfg): # type (Optional[Configuration]) -> client.CoreV1Api: + ''' + This is a workaround for supporting api token refresh in k8s client. 
+ + The function can be replace with `return client.CoreV1Api()` once the + upstream client supports token refresh. + ''' + if cfg: + return client.CoreV1Api(api_client=ApiClient(configuration=cfg)) + else: + return client.CoreV1Api() + +except ImportError as e: + # We need an exception class to be able to use it in ``except`` elsewhere + # in the code base + ApiException = BaseException + has_kubernetes = False + _import_err = e + + +def get_kube_client(in_cluster=conf.getboolean('kubernetes', 'in_cluster'), # type: bool + cluster_context=None, # type: Optional[str] + config_file=None, # type: Optional[str] + ): + """ + Retrieves Kubernetes client + + :param in_cluster: whether we are in cluster + :type in_cluster: bool + :param cluster_context: context of the cluster + :type cluster_context: str + :param config_file: configuration file + :type config_file: str + :return kubernetes client + :rtype client.CoreV1Api + """ + + if not has_kubernetes: + raise _import_err + + if not in_cluster: + if cluster_context is None: + cluster_context = conf.get('kubernetes', 'cluster_context', fallback=None) + if config_file is None: + config_file = conf.get('kubernetes', 'config_file', fallback=None) + + client_conf = _get_kube_config(in_cluster, cluster_context, config_file) + return _get_client_with_patched_configuration(client_conf) diff --git a/airflow/kubernetes/pod.py b/airflow/kubernetes/pod.py new file mode 100644 index 0000000000000..67dc98348263d --- /dev/null +++ b/airflow/kubernetes/pod.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Classes for interacting with Kubernetes API +""" + +import copy + +from kubernetes.client import models as k8s + +from airflow.kubernetes.k8s_model import K8SModel + + +class Resources(K8SModel): + __slots__ = ('request_memory', + 'request_cpu', + 'limit_memory', + 'limit_cpu', + 'limit_gpu', + 'request_ephemeral_storage', + 'limit_ephemeral_storage') + + """ + :param request_memory: requested memory + :type request_memory: str + :param request_cpu: requested CPU number + :type request_cpu: float | str + :param request_ephemeral_storage: requested ephemeral storage + :type request_ephemeral_storage: str + :param limit_memory: limit for memory usage + :type limit_memory: str + :param limit_cpu: Limit for CPU used + :type limit_cpu: float | str + :param limit_gpu: Limits for GPU used + :type limit_gpu: int + :param limit_ephemeral_storage: Limit for ephemeral storage + :type limit_ephemeral_storage: float | str + """ + + def __init__( + self, + request_memory=None, + request_cpu=None, + request_ephemeral_storage=None, + limit_memory=None, + limit_cpu=None, + limit_gpu=None, + limit_ephemeral_storage=None + ): + self.request_memory = request_memory + self.request_cpu = request_cpu + self.request_ephemeral_storage = request_ephemeral_storage + self.limit_memory = limit_memory + self.limit_cpu = limit_cpu + self.limit_gpu = limit_gpu + self.limit_ephemeral_storage = limit_ephemeral_storage + + def is_empty_resource_request(self): + """Whether resource is empty""" + return not self.has_limits() and not self.has_requests() + + def has_limits(self): + """Whether resource has limits""" + return self.limit_cpu is not None or \ + self.limit_memory is not None or \ + self.limit_gpu is not None or \ + self.limit_ephemeral_storage is not None + + def has_requests(self): + """Whether resource has requests""" + return self.request_cpu is not None or \ + self.request_memory is not None or \ + self.request_ephemeral_storage is not None + + def to_k8s_client_obj(self): + limits_raw = { + 'cpu': self.limit_cpu, + 'memory': self.limit_memory, + 'nvidia.com/gpu': self.limit_gpu, + 'ephemeral-storage': self.limit_ephemeral_storage + } + requests_raw = { + 'cpu': self.request_cpu, + 'memory': self.request_memory, + 'ephemeral-storage': self.request_ephemeral_storage + } + + limits = {k: v for k, v in limits_raw.items() if v} + requests = {k: v for k, v in requests_raw.items() if v} + resource_req = k8s.V1ResourceRequirements( + limits=limits, + requests=requests + ) + return resource_req + + def attach_to_pod(self, pod): + cp_pod = copy.deepcopy(pod) + resources = self.to_k8s_client_obj() + cp_pod.spec.containers[0].resources = resources + return cp_pod + + +class Port(K8SModel): + """POD port""" + __slots__ = ('name', 'container_port') + + def __init__( + self, + name=None, + container_port=None + ): + """Creates port""" + self.name = name + self.container_port = container_port + + def to_k8s_client_obj(self): + return k8s.V1ContainerPort(name=self.name, container_port=self.container_port) + + def attach_to_pod(self, pod): + cp_pod = copy.deepcopy(pod) + port = self.to_k8s_client_obj() + cp_pod.spec.containers[0].ports = cp_pod.spec.containers[0].ports or [] + cp_pod.spec.containers[0].ports.append(port) + return cp_pod diff --git a/airflow/kubernetes/pod_generator.py b/airflow/kubernetes/pod_generator.py new file mode 100644 index 0000000000000..211b0275ff59e --- /dev/null +++ b/airflow/kubernetes/pod_generator.py @@ -0,0 +1,689 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This module provides an interface between the previous Pod +API and outputs a kubernetes.client.models.V1Pod. +The advantage being that the full Kubernetes API +is supported and no serialization need be written. +""" + +import copy +import hashlib +import re +import os +import uuid +from functools import reduce + +import kubernetes.client.models as k8s +import yaml +from dateutil import parser +from kubernetes.client.api_client import ApiClient +from airflow.contrib.kubernetes.pod import _extract_volume_mounts + +from airflow.exceptions import AirflowConfigException +from airflow.version import version as airflow_version + +MAX_LABEL_LEN = 63 + +MAX_POD_ID_LEN = 253 + + +class PodDefaults(object): + """ + Static defaults for the PodGenerator + """ + + def __init__(self): + pass + + XCOM_MOUNT_PATH = '/airflow/xcom' + SIDECAR_CONTAINER_NAME = 'airflow-xcom-sidecar' + XCOM_CMD = 'trap "exit 0" INT; while true; do sleep 30; done;' + VOLUME_MOUNT = k8s.V1VolumeMount( + name='xcom', + mount_path=XCOM_MOUNT_PATH + ) + VOLUME = k8s.V1Volume( + name='xcom', + empty_dir=k8s.V1EmptyDirVolumeSource() + ) + SIDECAR_CONTAINER = k8s.V1Container( + name=SIDECAR_CONTAINER_NAME, + command=['sh', '-c', XCOM_CMD], + image='alpine', + volume_mounts=[VOLUME_MOUNT], + resources=k8s.V1ResourceRequirements( + requests={ + "cpu": "1m", + } + ), + ) + + +def make_safe_label_value(string): + """ + Valid label values must be 63 characters or less and must be empty or begin and + end with an alphanumeric character ([a-z0-9A-Z]) with dashes (-), underscores (_), + dots (.), and alphanumerics between. + + If the label value is greater than 63 chars once made safe, or differs in any + way from the original value sent to this function, then we need to truncate to + 53 chars, and append it with a unique hash. + """ + safe_label = re.sub(r"^[^a-z0-9A-Z]*|[^a-zA-Z0-9_\-\.]|[^a-z0-9A-Z]*$", "", string) + + if len(safe_label) > MAX_LABEL_LEN or string != safe_label: + safe_hash = hashlib.md5(string.encode()).hexdigest()[:9] + safe_label = safe_label[:MAX_LABEL_LEN - len(safe_hash) - 1] + "-" + safe_hash + + return safe_label + + +def datetime_to_label_safe_datestring(datetime_obj): + """ + Kubernetes doesn't like ":" in labels, since ISO datetime format uses ":" but + not "_" let's + replace ":" with "_" + + :param datetime_obj: datetime.datetime object + :return: ISO-like string representing the datetime + """ + return datetime_obj.isoformat().replace(":", "_").replace('+', '_plus_') + + +def label_safe_datestring_to_datetime(string): + """ + Kubernetes doesn't permit ":" in labels. 
ISO datetime format uses ":" but not + "_", let's + replace ":" with "_" + + :param string: str + :return: datetime.datetime object + """ + return parser.parse(string.replace('_plus_', '+').replace("_", ":")) + + +class PodGenerator(object): + """ + Contains Kubernetes Airflow Worker configuration logic + + Represents a kubernetes pod and manages execution of a single pod. + Any configuration that is container specific gets applied to + the first container in the list of containers. + + Parameters with a type of `kubernetes.client.models.*`/`k8s.*` can + often be replaced with their dictionary equivalent, for example the output of + `sanitize_for_serialization`. + + :param image: The docker image + :type image: Optional[str] + :param name: name in the metadata section (not the container name) + :type name: Optional[str] + :param namespace: pod namespace + :type namespace: Optional[str] + :param volume_mounts: list of kubernetes volumes mounts + :type volume_mounts: Optional[List[Union[k8s.V1VolumeMount, dict]]] + :param envs: A dict containing the environment variables + :type envs: Optional[Dict[str, str]] + :param cmds: The command to be run on the first container + :type cmds: Optional[List[str]] + :param args: The arguments to be run on the pod + :type args: Optional[List[str]] + :param labels: labels for the pod metadata + :type labels: Optional[Dict[str, str]] + :param node_selectors: node selectors for the pod + :type node_selectors: Optional[Dict[str, str]] + :param ports: list of ports. Applies to the first container. + :type ports: Optional[List[Union[k8s.V1ContainerPort, dict]]] + :param volumes: Volumes to be attached to the first container + :type volumes: Optional[List[Union[k8s.V1Volume, dict]]] + :param image_pull_policy: Specify a policy to cache or always pull an image + :type image_pull_policy: str + :param restart_policy: The restart policy of the pod + :type restart_policy: str + :param image_pull_secrets: Any image pull secrets to be given to the pod. + If more than one secret is required, provide a comma separated list: + secret_a,secret_b + :type image_pull_secrets: str + :param init_containers: A list of init containers + :type init_containers: Optional[List[k8s.V1Container]] + :param service_account_name: Identity for processes that run in a Pod + :type service_account_name: Optional[str] + :param resources: Resource requirements for the first containers + :type resources: Optional[Union[k8s.V1ResourceRequirements, dict]] + :param annotations: annotations for the pod + :type annotations: Optional[Dict[str, str]] + :param affinity: A dict containing a group of affinity scheduling rules + :type affinity: Optional[dict] + :param hostnetwork: If True enable host networking on the pod + :type hostnetwork: bool + :param tolerations: A list of kubernetes tolerations + :type tolerations: Optional[list] + :param security_context: A dict containing the security context for the pod + :type security_context: Optional[Union[k8s.V1PodSecurityContext, dict]] + :param configmaps: Any configmap refs to envfrom. + If more than one configmap is required, provide a comma separated list + configmap_a,configmap_b + :type configmaps: str + :param dnspolicy: Specify a dnspolicy for the pod + :type dnspolicy: str + :param schedulername: Specify a schedulername for the pod + :type schedulername: Optional[str] + :param pod: The fully specified pod. Mutually exclusive with `path_or_string` + :type pod: Optional[kubernetes.client.models.V1Pod] + :param pod_template_file: Path to YAML file. 
Mutually exclusive with `pod` + :type pod_template_file: Optional[str] + :param extract_xcom: Whether to bring up a container for xcom + :type extract_xcom: bool + """ + + def __init__( + self, + image=None, + name=None, + namespace=None, + volume_mounts=None, + envs=None, + cmds=None, + args=None, + labels=None, + node_selectors=None, + ports=None, + volumes=None, + image_pull_policy=None, + restart_policy=None, + image_pull_secrets=None, + init_containers=None, + service_account_name=None, + resources=None, + annotations=None, + affinity=None, + hostnetwork=False, + tolerations=None, + security_context=None, + configmaps=None, + dnspolicy=None, + schedulername=None, + priority_class_name=None, + pod=None, + pod_template_file=None, + extract_xcom=False, + ): + + if pod_template_file: + self.ud_pod = self.deserialize_model_file(pod_template_file) + else: + self.ud_pod = pod + + self.pod = k8s.V1Pod() + self.pod.api_version = 'v1' + self.pod.kind = 'Pod' + + # Pod Metadata + self.metadata = k8s.V1ObjectMeta() + self.metadata.labels = labels + self.metadata.name = name + self.metadata.namespace = namespace + self.metadata.annotations = annotations + + # Pod Container + self.container = k8s.V1Container(name='base') + self.container.image = image + self.container.env = [] + + if envs: + if isinstance(envs, dict): + for key, val in envs.items(): + self.container.env.append(k8s.V1EnvVar( + name=key, + value=val + )) + elif isinstance(envs, list): + self.container.env.extend(envs) + + configmaps = configmaps or [] + self.container.env_from = [] + for configmap in configmaps: + self.container.env_from.append(k8s.V1EnvFromSource( + config_map_ref=k8s.V1ConfigMapEnvSource( + name=configmap + ) + )) + + self.container.command = cmds or [] + self.container.args = args or [] + self.container.image_pull_policy = image_pull_policy + self.container.ports = ports or [] + self.container.resources = resources + self.container.volume_mounts = [v.to_k8s_client_obj() for v in _extract_volume_mounts(volume_mounts)] + + # Pod Spec + self.spec = k8s.V1PodSpec(containers=[]) + self.spec.security_context = security_context + self.spec.tolerations = tolerations + self.spec.dns_policy = dnspolicy + self.spec.scheduler_name = schedulername + self.spec.host_network = hostnetwork + self.spec.affinity = affinity + self.spec.service_account_name = service_account_name + self.spec.init_containers = init_containers + self.spec.volumes = volumes or [] + self.spec.node_selector = node_selectors + self.spec.restart_policy = restart_policy + self.spec.priority_class_name = priority_class_name + self.spec.image_pull_secrets = [] + + if image_pull_secrets: + for image_pull_secret in image_pull_secrets.split(','): + self.spec.image_pull_secrets.append(k8s.V1LocalObjectReference( + name=image_pull_secret + )) + + # Attach sidecar + self.extract_xcom = extract_xcom + + def gen_pod(self): + result = self.ud_pod + + if result is None: + result = self.pod + result.spec = self.spec + result.metadata = self.metadata + result.spec.containers = [self.container] + + result.metadata.name = self.make_unique_pod_id(result.metadata.name) + + if self.extract_xcom: + result = self.add_sidecar(result) + + return result + + @staticmethod + def make_unique_pod_id(dag_id): + """ + Kubernetes pod names must be <= 253 chars and must pass the following regex for + validation + ``^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`` + :param dag_id: a dag_id with only alphanumeric characters + :return: ``str`` valid Pod name of appropriate 
length + """ + if not dag_id: + return None + + safe_uuid = uuid.uuid4().hex + safe_pod_id = dag_id[:MAX_POD_ID_LEN - len(safe_uuid) - 1] + safe_pod_id = safe_pod_id + "-" + safe_uuid + + return safe_pod_id + + @staticmethod + def add_sidecar(pod): + pod_cp = copy.deepcopy(pod) + pod_cp.spec.volumes = pod.spec.volumes or [] + pod_cp.spec.volumes.insert(0, PodDefaults.VOLUME) + pod_cp.spec.containers[0].volume_mounts = pod_cp.spec.containers[0].volume_mounts or [] + pod_cp.spec.containers[0].volume_mounts.insert(0, PodDefaults.VOLUME_MOUNT) + pod_cp.spec.containers.append(PodDefaults.SIDECAR_CONTAINER) + + return pod_cp + + @staticmethod + def from_obj(obj): + if obj is None: + return None + + if isinstance(obj, PodGenerator): + return obj.gen_pod() + + if not isinstance(obj, dict): + raise TypeError( + 'Cannot convert a non-dictionary or non-PodGenerator ' + 'object into a KubernetesExecutorConfig') + + # We do not want to extract constant here from ExecutorLoader because it is just + # A name in dictionary rather than executor selection mechanism and it causes cyclic import + namespaced = obj.get("KubernetesExecutor", {}) + + if not namespaced: + return None + + resources = namespaced.get('resources') + + if resources is None: + def extract(cpu, memory, ephemeral_storage, limit_gpu=None): + resources_obj = { + 'cpu': namespaced.pop(cpu, None), + 'memory': namespaced.pop(memory, None), + 'ephemeral-storage': namespaced.pop(ephemeral_storage, None), + } + if limit_gpu is not None: + resources_obj['nvidia.com/gpu'] = namespaced.pop(limit_gpu, None) + + resources_obj = {k: v for k, v in resources_obj.items() if v is not None} + + if all(r is None for r in resources_obj): + resources_obj = None + return namespaced, resources_obj + + namespaced, requests = extract('request_cpu', 'request_memory', 'request_ephemeral_storage') + namespaced, limits = extract('limit_cpu', 'limit_memory', 'limit_ephemeral_storage', + limit_gpu='limit_gpu') + + if requests is None and limits is None: + resources = None + else: + resources = k8s.V1ResourceRequirements( + requests=requests, + limits=limits + ) + elif isinstance(resources, dict): + resources = k8s.V1ResourceRequirements( + requests=resources['requests'], + limits=resources['limits'] + ) + + annotations = namespaced.get('annotations', {}) + gcp_service_account_key = namespaced.get('gcp_service_account_key', None) + + if annotations is not None and gcp_service_account_key is not None: + annotations.update({ + 'iam.cloud.google.com/service-account': gcp_service_account_key + }) + + namespaced['resources'] = resources + return PodGenerator(**namespaced).gen_pod() + + @staticmethod + def reconcile_pods(base_pod, client_pod): + """ + :param base_pod: has the base attributes which are overwritten if they exist + in the client pod and remain if they do not exist in the client_pod + :type base_pod: k8s.V1Pod + :param client_pod: the pod that the client wants to create. + :type client_pod: k8s.V1Pod + :return: the merged pods + + This can't be done recursively as certain fields are preserved, + some overwritten, and some concatenated, e.g. The command + should be preserved from base, the volumes appended to and + the other fields overwritten. 
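+
+        For example, if ``base_pod`` defines ``spec.containers[0].command`` and one volume
+        while ``client_pod`` only sets an image and a second (differently named) volume,
+        the merged pod keeps the base command, uses the client image, and carries both volumes.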
+ """ + if client_pod is None: + return base_pod + + client_pod_cp = copy.deepcopy(client_pod) + client_pod_cp.spec = PodGenerator.reconcile_specs(base_pod.spec, client_pod_cp.spec) + client_pod_cp.metadata = PodGenerator.reconcile_metadata(base_pod.metadata, client_pod_cp.metadata) + client_pod_cp = merge_objects(base_pod, client_pod_cp) + + return client_pod_cp + + @staticmethod + def reconcile_metadata(base_meta, client_meta): + """ + :param base_meta: has the base attributes which are overwritten if they exist + in the client_meta and remain if they do not exist in the client_meta + :type base_meta: k8s.V1ObjectMeta + :param client_meta: the spec that the client wants to create. + :type client_meta: k8s.V1ObjectMeta + :return: the merged specs + """ + if base_meta and not client_meta: + return base_meta + if not base_meta and client_meta: + return client_meta + elif client_meta and base_meta: + client_meta.labels = merge_objects(base_meta.labels, client_meta.labels) + client_meta.annotations = merge_objects(base_meta.annotations, client_meta.annotations) + extend_object_field(base_meta, client_meta, 'managed_fields') + extend_object_field(base_meta, client_meta, 'finalizers') + extend_object_field(base_meta, client_meta, 'owner_references') + return merge_objects(base_meta, client_meta) + + return None + + @staticmethod + def reconcile_specs(base_spec, + client_spec): + """ + :param base_spec: has the base attributes which are overwritten if they exist + in the client_spec and remain if they do not exist in the client_spec + :type base_spec: k8s.V1PodSpec + :param client_spec: the spec that the client wants to create. + :type client_spec: k8s.V1PodSpec + :return: the merged specs + """ + if base_spec and not client_spec: + return base_spec + if not base_spec and client_spec: + return client_spec + elif client_spec and base_spec: + client_spec.containers = PodGenerator.reconcile_containers( + base_spec.containers, client_spec.containers + ) + merged_spec = extend_object_field(base_spec, client_spec, 'volumes') + return merge_objects(base_spec, merged_spec) + + return None + + @staticmethod + def reconcile_containers(base_containers, + client_containers): + """ + :param base_containers: has the base attributes which are overwritten if they exist + in the client_containers and remain if they do not exist in the client_containers + :type base_containers: List[k8s.V1Container] + :param client_containers: the containers that the client wants to create. + :type client_containers: List[k8s.V1Container] + :return: the merged containers + + The runs recursively over the list of containers. 
+ """ + if not base_containers: + return client_containers + if not client_containers: + return base_containers + + client_container = client_containers[0] + base_container = base_containers[0] + client_container = extend_object_field( + base_container, + client_container, + 'volume_mounts', + 'mount_path') + client_container = extend_object_field(base_container, client_container, 'env') + client_container = extend_object_field(base_container, client_container, 'env_from', None) + client_container = extend_object_field(base_container, client_container, 'ports') + client_container = extend_object_field(base_container, client_container, 'volume_devices') + client_container = merge_objects(base_container, client_container) + + return [client_container] + PodGenerator.reconcile_containers( + base_containers[1:], client_containers[1:] + ) + + @staticmethod + def construct_pod( + dag_id, + task_id, + pod_id, + try_number, + kube_image, + date, + command, + pod_override_object, + base_worker_pod, + namespace, + worker_uuid + ): + """ + Construct a pod by gathering and consolidating the configuration from 3 places: + - airflow.cfg + - executor_config + - dynamic arguments + """ + try: + image = pod_override_object.spec.containers[0].image # type: ignore + if not image: + image = kube_image + except Exception: # pylint: disable=W0703 + image = kube_image + dynamic_pod = PodGenerator( + namespace=namespace, + image=image, + labels={ + 'airflow-worker': worker_uuid, + 'dag_id': make_safe_label_value(dag_id), + 'task_id': make_safe_label_value(task_id), + 'execution_date': datetime_to_label_safe_datestring(date), + 'try_number': str(try_number), + 'airflow_version': airflow_version.replace('+', '-'), + 'kubernetes_executor': 'True', + }, + annotations={ + 'dag_id': dag_id, + 'task_id': task_id, + 'execution_date': date.isoformat(), + 'try_number': str(try_number), + }, + cmds=command, + name=pod_id + ).gen_pod() + + # Reconcile the pods starting with the first chronologically, + # Pod from the pod_template_File -> Pod from executor_config arg -> Pod from the K8s executor + pod_list = [base_worker_pod, pod_override_object, dynamic_pod] + + return reduce(PodGenerator.reconcile_pods, pod_list) + + @staticmethod + def serialize_pod(pod): + """ + Converts a k8s.V1Pod into a jsonified object + """ + api_client = ApiClient() + return api_client.sanitize_for_serialization(pod) + + @staticmethod + def deserialize_model_file(path): + """ + :param path: Path to the file + :return: a kubernetes.client.models.V1Pod + + Unfortunately we need access to the private method + ``_ApiClient__deserialize_model`` from the kubernetes client. + This issue is tracked here; https://github.com/kubernetes-client/python/issues/977. + """ + api_client = ApiClient() + if os.path.exists(path): + with open(path) as stream: + pod = yaml.safe_load(stream) + else: + pod = yaml.safe_load(path) + + # pylint: disable=protected-access + return api_client._ApiClient__deserialize_model(pod, k8s.V1Pod) + + +def merge_objects(base_obj, client_obj): + """ + :param base_obj: has the base attributes which are overwritten if they exist + in the client_obj and remain if they do not exist in the client_obj + :param client_obj: the object that the client wants to create. 
+ :return: the merged objects + """ + if not base_obj: + return client_obj + if not client_obj: + return base_obj + + client_obj_cp = copy.deepcopy(client_obj) + + if isinstance(base_obj, dict) and isinstance(client_obj_cp, dict): + base_obj_cp = copy.deepcopy(base_obj) + base_obj_cp.update(client_obj_cp) + return base_obj_cp + + for base_key in base_obj.to_dict().keys(): + base_val = getattr(base_obj, base_key, None) + if not getattr(client_obj, base_key, None) and base_val: + if not isinstance(client_obj_cp, dict): + setattr(client_obj_cp, base_key, base_val) + else: + client_obj_cp[base_key] = base_val + return client_obj_cp + + +def extend_object_field(base_obj, client_obj, field_name, field_to_merge="name"): + """ + :param base_obj: an object which has a property `field_name` that is a list + :param client_obj: an object which has a property `field_name` that is a list. + A copy of this object is returned with `field_name` modified + :param field_name: the name of the list field + :type field_name: str + :return: the client_obj with the property `field_name` being the two properties appended + """ + client_obj_cp = copy.deepcopy(client_obj) + base_obj_field = getattr(base_obj, field_name, None) + client_obj_field = getattr(client_obj, field_name, None) + + if (not isinstance(base_obj_field, list) and base_obj_field is not None) or \ + (not isinstance(client_obj_field, list) and client_obj_field is not None): + raise ValueError("The chosen field must be a list.") + + if not base_obj_field: + return client_obj_cp + if not client_obj_field: + setattr(client_obj_cp, field_name, base_obj_field) + return client_obj_cp + + if field_to_merge is None: + # no merge, just append + appended_fields = base_obj_field + client_obj_field + else: + base_obj_set = _get_dict_from_list(base_obj_field, field_to_merge) + client_obj_set = _get_dict_from_list(client_obj_field, field_to_merge) + + appended_fields = _merge_list_of_objects(base_obj_set, client_obj_set) + + setattr(client_obj_cp, field_name, appended_fields) + return client_obj_cp + + +def _merge_list_of_objects(base_obj_set, client_obj_set): + for k, v in base_obj_set.items(): + if k not in client_obj_set: + client_obj_set[k] = v + else: + client_obj_set[k] = merge_objects(v, client_obj_set[k]) + appended_field_keys = sorted(client_obj_set.keys()) + appended_fields = [client_obj_set[k] for k in appended_field_keys] + return appended_fields + + +def _get_dict_from_list(base_list, field_to_merge="name"): + """ + :type base_list: list(Optional[dict, *to_dict]) + """ + result = {} + for obj in base_list: + if isinstance(obj, dict): + result[obj[field_to_merge]] = obj + elif hasattr(obj, "to_dict"): + result[getattr(obj, field_to_merge)] = obj + else: + raise AirflowConfigException("Trying to merge invalid object {}".format(obj)) + return result diff --git a/airflow/contrib/kubernetes/pod_launcher.py b/airflow/kubernetes/pod_launcher.py similarity index 54% rename from airflow/contrib/kubernetes/pod_launcher.py rename to airflow/kubernetes/pod_launcher.py index 51ee1348586b3..468e0777dd024 100644 --- a/airflow/contrib/kubernetes/pod_launcher.py +++ b/airflow/kubernetes/pod_launcher.py @@ -17,31 +17,30 @@ """Launches PODs""" import json import time +import warnings from datetime import datetime as dt -from typing import Tuple, Optional - -from requests.exceptions import BaseHTTPError import tenacity - from kubernetes import watch, client +from kubernetes.client.api_client import ApiClient +from kubernetes.client import models as k8s from 
kubernetes.client.rest import ApiException from kubernetes.stream import stream as kubernetes_stream +from requests.exceptions import BaseHTTPError -from airflow.settings import pod_mutation_hook +from airflow import AirflowException +from airflow import settings +from airflow.contrib.kubernetes.pod import ( + Pod, _extract_env_vars_and_secrets, _extract_volumes, _extract_volume_mounts, + _extract_ports, _extract_security_context +) +from airflow.kubernetes.kube_client import get_kube_client +from airflow.kubernetes.pod_generator import PodDefaults, PodGenerator from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.state import State -from airflow import AirflowException - -from airflow.contrib.kubernetes.pod import Pod -from airflow.contrib.kubernetes.kubernetes_request_factory import \ - pod_request_factory as pod_factory -from .kube_client import get_kube_client - -class PodStatus(object): - """Status of the PODs""" +class PodStatus: PENDING = 'pending' RUNNING = 'running' FAILED = 'failed' @@ -68,56 +67,88 @@ def __init__(self, cluster_context=cluster_context) self._watch = watch.Watch() self.extract_xcom = extract_xcom - self.kube_req_factory = pod_factory.ExtractXcomPodRequestFactory( - ) if extract_xcom else pod_factory.SimplePodRequestFactory() def run_pod_async(self, pod, **kwargs): - """Runs POD asynchronously""" - pod_mutation_hook(pod) + """Runs POD asynchronously - req = self.kube_req_factory.create(pod) - self.log.debug('Pod Creation Request: \n%s', json.dumps(req, indent=2)) + :param pod: Pod to run + :type pod: k8s.V1Pod + """ + pod = self._mutate_pod_backcompat(pod) + + sanitized_pod = self._client.api_client.sanitize_for_serialization(pod) + json_pod = json.dumps(sanitized_pod, indent=2) + + self.log.debug('Pod Creation Request: \n%s', json_pod) try: - resp = self._client.create_namespaced_pod(body=req, namespace=pod.namespace, **kwargs) + resp = self._client.create_namespaced_pod(body=sanitized_pod, + namespace=pod.metadata.namespace, **kwargs) self.log.debug('Pod Creation Response: %s', resp) - except ApiException: - self.log.exception('Exception when attempting to create Namespaced Pod.') - raise + except Exception as e: + self.log.exception('Exception when attempting ' + 'to create Namespaced Pod: %s', json_pod) + raise e return resp + @staticmethod + def _mutate_pod_backcompat(pod): + """Backwards compatible Pod Mutation Hook""" + try: + dummy_pod = _convert_to_airflow_pod(pod) + settings.pod_mutation_hook(dummy_pod) + warnings.warn( + "Using `airflow.contrib.kubernetes.pod.Pod` is deprecated. " + "Please use `k8s.V1Pod` instead.", DeprecationWarning, stacklevel=2 + ) + dummy_pod = dummy_pod.to_v1_kubernetes_pod() + + new_pod = PodGenerator.reconcile_pods(pod, dummy_pod) + except AttributeError as e: + try: + settings.pod_mutation_hook(pod) + return pod + except AttributeError as e2: + raise Exception([e, e2]) + return new_pod + def delete_pod(self, pod): """Deletes POD""" try: self._client.delete_namespaced_pod( - pod.name, pod.namespace, body=client.V1DeleteOptions()) + pod.metadata.name, pod.metadata.namespace, body=client.V1DeleteOptions()) except ApiException as e: # If the pod is already deleted if e.status != 404: raise - def run_pod(self, pod, startup_timeout=120, get_logs=True): - # type: (Pod, int, bool) -> Tuple[State, Optional[str]] + def start_pod( + self, + pod, + startup_timeout): """ Launches the pod synchronously and waits for completion. 
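+        A typical (illustrative) call sequence pairs it with ``monitor_pod``::
+
+            launcher = PodLauncher()
+            launcher.start_pod(pod, startup_timeout=120)
+            state, result = launcher.monitor_pod(pod, get_logs=True)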
- Args: - pod (Pod): - startup_timeout (int): Timeout for startup of the pod (if pod is pending for - too long, considers task a failure + + :param pod: + :param startup_timeout: Timeout for startup of the pod (if pod is pending for too long, fails task) + :return: """ resp = self.run_pod_async(pod) curr_time = dt.now() if resp.status.start_time is None: while self.pod_not_started(pod): + self.log.warning("Pod not yet started: %s", pod.metadata.name) delta = dt.now() - curr_time if delta.total_seconds() >= startup_timeout: raise AirflowException("Pod took too long to start") time.sleep(1) - self.log.debug('Pod not yet started') - - return self._monitor_pod(pod, get_logs) - def _monitor_pod(self, pod, get_logs): - # type: (Pod, bool) -> Tuple[State, Optional[str]] + def monitor_pod(self, pod, get_logs): + """ + :param pod: pod spec that will be monitored + :type pod : V1Pod + :param get_logs: whether to read the logs locally + :return: Tuple[State, Optional[str]] + """ if get_logs: logs = self.read_pod_logs(pod) @@ -126,13 +157,13 @@ def _monitor_pod(self, pod, get_logs): result = None if self.extract_xcom: while self.base_container_is_running(pod): - self.log.info('Container %s has state %s', pod.name, State.RUNNING) + self.log.info('Container %s has state %s', pod.metadata.name, State.RUNNING) time.sleep(2) result = self._extract_xcom(pod) self.log.info(result) result = json.loads(result) while self.pod_is_running(pod): - self.log.info('Pod %s has state %s', pod.name, State.RUNNING) + self.log.info('Pod %s has state %s', pod.metadata.name, State.RUNNING) time.sleep(2) return self._task_status(self.read_pod(pod)), result @@ -158,6 +189,8 @@ def base_container_is_running(self, pod): event = self.read_pod(pod) status = next(iter(filter(lambda s: s.name == 'base', event.status.container_statuses)), None) + if not status: + return False return status.state.running is not None @tenacity.retry( @@ -165,15 +198,15 @@ def base_container_is_running(self, pod): wait=tenacity.wait_exponential(), reraise=True ) - def read_pod_logs(self, pod): + def read_pod_logs(self, pod, tail_lines=10): """Reads log from the POD""" try: return self._client.read_namespaced_pod_log( - name=pod.name, - namespace=pod.namespace, + name=pod.metadata.name, + namespace=pod.metadata.namespace, container='base', follow=True, - tail_lines=10, + tail_lines=tail_lines, _preload_content=False ) except BaseHTTPError as e: @@ -181,6 +214,23 @@ def read_pod_logs(self, pod): 'There was an error reading the kubernetes API: {}'.format(e) ) + @tenacity.retry( + stop=tenacity.stop_after_attempt(3), + wait=tenacity.wait_exponential(), + reraise=True + ) + def read_pod_events(self, pod): + """Reads events from the POD""" + try: + return self._client.list_namespaced_event( + namespace=pod.metadata.namespace, + field_selector="involvedObject.name={}".format(pod.metadata.name) + ) + except BaseHTTPError as e: + raise AirflowException( + 'There was an error reading the kubernetes API: {}'.format(e) + ) + @tenacity.retry( stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), @@ -189,7 +239,7 @@ def read_pod_logs(self, pod): def read_pod(self, pod): """Read POD information""" try: - return self._client.read_namespaced_pod(pod.name, pod.namespace) + return self._client.read_namespaced_pod(pod.metadata.name, pod.metadata.namespace) except BaseHTTPError as e: raise AirflowException( 'There was an error reading the kubernetes API: {}'.format(e) @@ -197,19 +247,19 @@ def read_pod(self, pod): def _extract_xcom(self, pod): resp = 
kubernetes_stream(self._client.connect_get_namespaced_pod_exec, - pod.name, pod.namespace, - container=self.kube_req_factory.SIDECAR_CONTAINER_NAME, + pod.metadata.name, pod.metadata.namespace, + container=PodDefaults.SIDECAR_CONTAINER_NAME, command=['/bin/sh'], stdin=True, stdout=True, stderr=True, tty=False, _preload_content=False) try: result = self._exec_pod_command( - resp, 'cat {}/return.json'.format(self.kube_req_factory.XCOM_MOUNT_PATH)) + resp, 'cat {}/return.json'.format(PodDefaults.XCOM_MOUNT_PATH)) self._exec_pod_command(resp, 'kill -s SIGINT 1') finally: resp.close() if result is None: - raise AirflowException('Failed to extract xcom from pod: {}'.format(pod.name)) + raise AirflowException('Failed to extract xcom from pod: {}'.format(pod.metadata.name)) return result def _exec_pod_command(self, resp, command): @@ -226,7 +276,7 @@ def _exec_pod_command(self, resp, command): return None def process_status(self, job_id, status): - """Process status infomration for the JOB""" + """Process status information for the JOB""" status = status.lower() if status == PodStatus.PENDING: return State.QUEUED @@ -241,3 +291,43 @@ def process_status(self, job_id, status): else: self.log.info('Event: Invalid state %s on job %s', status, job_id) return State.FAILED + + +def _convert_to_airflow_pod(pod): + """ + Converts a k8s V1Pod object into an `airflow.kubernetes.pod.Pod` object. + This function is purely for backwards compatibility + """ + base_container = pod.spec.containers[0] # type: k8s.V1Container + env_vars, secrets = _extract_env_vars_and_secrets(base_container.env) + volumes = _extract_volumes(pod.spec.volumes) + api_client = ApiClient() + init_containers = pod.spec.init_containers + image_pull_secrets = pod.spec.image_pull_secrets or [] + if pod.spec.init_containers is not None: + init_containers = [api_client.sanitize_for_serialization(i) for i in pod.spec.init_containers] + dummy_pod = Pod( + image=base_container.image, + envs=env_vars, + cmds=base_container.command, + args=base_container.args, + labels=pod.metadata.labels, + annotations=pod.metadata.annotations, + node_selectors=pod.spec.node_selector, + name=pod.metadata.name, + ports=_extract_ports(base_container.ports), + volumes=volumes, + volume_mounts=_extract_volume_mounts(base_container.volume_mounts), + namespace=pod.metadata.namespace, + image_pull_policy=base_container.image_pull_policy or 'IfNotPresent', + tolerations=pod.spec.tolerations, + init_containers=init_containers, + image_pull_secrets=",".join([i.name for i in image_pull_secrets]), + resources=base_container.resources, + service_account_name=pod.spec.service_account_name, + secrets=secrets, + affinity=api_client.sanitize_for_serialization(pod.spec.affinity), + hostnetwork=pod.spec.host_network, + security_context=_extract_security_context(pod.spec.security_context) + ) + return dummy_pod diff --git a/airflow/kubernetes/pod_runtime_info_env.py b/airflow/kubernetes/pod_runtime_info_env.py new file mode 100644 index 0000000000000..95fbe6bfe1865 --- /dev/null +++ b/airflow/kubernetes/pod_runtime_info_env.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Classes for interacting with Kubernetes API +""" + +import copy +from airflow.kubernetes.k8s_model import K8SModel + + +class PodRuntimeInfoEnv(K8SModel): + """Defines Pod runtime information as environment variable""" + + def __init__(self, name, field_path): + """ + Adds Kubernetes pod runtime information as environment variables such as namespace, pod IP, pod name. + Full list of options can be found in kubernetes documentation. + + :param name: the name of the environment variable + :type: name: str + :param field_path: path to pod runtime info. Ex: metadata.namespace | status.podIP + :type: field_path: str + """ + self.name = name + self.field_path = field_path + + def to_k8s_client_obj(self): + """ + :return: kubernetes.client.models.V1EnvVar + """ + import kubernetes.client.models as k8s + return k8s.V1EnvVar( + name=self.name, + value_from=k8s.V1EnvVarSource( + field_ref=k8s.V1ObjectFieldSelector( + field_path=self.field_path + ) + ) + ) + + def attach_to_pod(self, pod): + cp_pod = copy.deepcopy(pod) + env = self.to_k8s_client_obj() + cp_pod.spec.containers[0].env = cp_pod.spec.containers[0].env or [] + cp_pod.spec.containers[0].env.append(env) + return cp_pod diff --git a/airflow/kubernetes/refresh_config.py b/airflow/kubernetes/refresh_config.py new file mode 100644 index 0000000000000..e7c5c138c14e4 --- /dev/null +++ b/airflow/kubernetes/refresh_config.py @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
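For orientation, a minimal usage sketch for the PodRuntimeInfoEnv model added above in pod_runtime_info_env.py; the pod construction below is an illustrative assumption (in practice the pod would usually come from PodGenerator), not something taken from this patch:

    from kubernetes.client import models as k8s

    from airflow.kubernetes.pod_runtime_info_env import PodRuntimeInfoEnv

    # Build a bare pod with a single 'base' container, roughly as PodGenerator would.
    pod = k8s.V1Pod(
        metadata=k8s.V1ObjectMeta(name="example", namespace="default"),
        spec=k8s.V1PodSpec(containers=[k8s.V1Container(name="base", image="busybox")]),
    )

    # Expose the pod's own namespace to the task container as $POD_NAMESPACE.
    pod = PodRuntimeInfoEnv(name="POD_NAMESPACE", field_path="metadata.namespace").attach_to_pod(pod)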
+ +""" +NOTE: this module can be removed once upstream client supports token refresh +see: https://github.com/kubernetes-client/python/issues/741 +""" + +import calendar +import logging +import os +import time +from datetime import datetime + +import yaml +from kubernetes.client import Configuration +from kubernetes.config.exec_provider import ExecProvider +from kubernetes.config.kube_config import KUBE_CONFIG_DEFAULT_LOCATION, KubeConfigLoader + + +class RefreshKubeConfigLoader(KubeConfigLoader): + """ + Patched KubeConfigLoader, this subclass takes expirationTimestamp into + account and sets api key refresh callback hook in Configuration object + """ + def __init__(self, *args, **kwargs): + KubeConfigLoader.__init__(self, *args, **kwargs) + self.api_key_expire_ts = None + + def _load_from_exec_plugin(self): + """ + We override _load_from_exec_plugin method to also read and store + expiration timestamp for aws-iam-authenticator. It will be later + used for api token refresh. + """ + if 'exec' not in self._user: + return None + try: + status = ExecProvider(self._user['exec']).run() + if 'token' not in status: + logging.error('exec: missing token field in plugin output') + return None + self.token = "Bearer %s" % status['token'] # pylint: disable=W0201 + ts_str = status.get('expirationTimestamp') + if ts_str: + self.api_key_expire_ts = calendar.timegm( + datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z").timetuple(), + ) + return True + except Exception as e: # pylint: disable=W0703 + logging.error(str(e)) + + def refresh_api_key(self, client_configuration): + """ + Refresh API key if expired + """ + if self.api_key_expire_ts and time.time() >= self.api_key_expire_ts: + self.load_and_set(client_configuration) + + def load_and_set(self, client_configuration): + KubeConfigLoader.load_and_set(self, client_configuration) + client_configuration.refresh_api_key = self.refresh_api_key + + +class RefreshConfiguration(Configuration): + """ + Patched Configuration, this subclass takes api key refresh callback hook + into account + """ + def __init__(self, *args, **kwargs): + Configuration.__init__(self, *args, **kwargs) + self.refresh_api_key = None + + def get_api_key_with_prefix(self, identifier): + if self.refresh_api_key: + self.refresh_api_key(self) # pylint: disable=E1102 + return Configuration.get_api_key_with_prefix(self, identifier) + + +def _get_kube_config_loader_for_yaml_file(filename, **kwargs): + """ + Adapted from the upstream _get_kube_config_loader_for_yaml_file function, changed + KubeConfigLoader to RefreshKubeConfigLoader + """ + with open(filename) as f: + return RefreshKubeConfigLoader( + config_dict=yaml.safe_load(f), + config_base_path=os.path.abspath(os.path.dirname(filename)), + **kwargs) + + +def load_kube_config(client_configuration, config_file=None, context=None): + """ + Adapted from the upstream load_kube_config function, changes: + - removed persist_config argument since it's not being used + - removed `client_configuration is None` branch since we always pass + in client configuration + """ + if config_file is None: + config_file = os.path.expanduser(KUBE_CONFIG_DEFAULT_LOCATION) + + loader = _get_kube_config_loader_for_yaml_file( + config_file, active_context=context, config_persister=None) + loader.load_and_set(client_configuration) diff --git a/airflow/kubernetes/secret.py b/airflow/kubernetes/secret.py new file mode 100644 index 0000000000000..df077478c5254 --- /dev/null +++ b/airflow/kubernetes/secret.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software
Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Classes for interacting with Kubernetes API +""" + +import uuid +import copy +from airflow.exceptions import AirflowConfigException +from airflow.kubernetes.k8s_model import K8SModel + + +class Secret(K8SModel): + """Defines Kubernetes Secret Volume""" + + def __init__(self, deploy_type, deploy_target, secret, key=None): + """ + Initialize a Kubernetes Secret Object. Used to track requested secrets from + the user. + + :param deploy_type: The type of secret deploy in Kubernetes, either `env` or + `volume` + :type deploy_type: str + :param deploy_target: (Optional) The environment variable when + `deploy_type` `env` or file path when `deploy_type` `volume` where + expose secret. If `key` is not provided deploy target should be None. + :type deploy_target: str or None + :param secret: Name of the secrets object in Kubernetes + :type secret: str + :param key: (Optional) Key of the secret within the Kubernetes Secret + if not provided in `deploy_type` `env` it will mount all secrets in object + :type key: str or None + """ + if deploy_type not in ('env', 'volume'): + raise AirflowConfigException("deploy_type must be env or volume") + + self.deploy_type = deploy_type + self.deploy_target = deploy_target + + if deploy_target is not None and deploy_type == 'env': + # if deploying to env, capitalize the deploy target + self.deploy_target = deploy_target.upper() + + if key is not None and deploy_target is None and deploy_type == "env": + raise AirflowConfigException( + 'If `key` is set, `deploy_target` should not be None' + ) + + self.secret = secret + self.key = key + + def to_env_secret(self): + import kubernetes.client.models as k8s + return k8s.V1EnvVar( + name=self.deploy_target, + value_from=k8s.V1EnvVarSource( + secret_key_ref=k8s.V1SecretKeySelector( + name=self.secret, + key=self.key + ) + ) + ) + + def to_env_from_secret(self): + import kubernetes.client.models as k8s + return k8s.V1EnvFromSource( + secret_ref=k8s.V1SecretEnvSource(name=self.secret) + ) + + def to_volume_secret(self): + import kubernetes.client.models as k8s + vol_id = 'secretvol{}'.format(uuid.uuid4()) + if self.deploy_target: + volume_mount = k8s.V1VolumeMount( + mount_path=self.deploy_target, + name=vol_id, + read_only=True + ) + else: + volume_mount = None + return ( + k8s.V1Volume( + name=vol_id, + secret=k8s.V1SecretVolumeSource( + secret_name=self.secret + ) + ), + volume_mount + ) + + def attach_to_pod(self, pod): + cp_pod = copy.deepcopy(pod) + if self.deploy_type == 'volume': + volume, volume_mount = self.to_volume_secret() + cp_pod.spec.volumes = pod.spec.volumes or [] + cp_pod.spec.volumes.append(volume) + if volume_mount: + cp_pod.spec.containers[0].volume_mounts = pod.spec.containers[0].volume_mounts or [] + 
cp_pod.spec.containers[0].volume_mounts.append(volume_mount) + if self.deploy_type == 'env' and self.key is not None: + env = self.to_env_secret() + cp_pod.spec.containers[0].env = cp_pod.spec.containers[0].env or [] + cp_pod.spec.containers[0].env.append(env) + if self.deploy_type == 'env' and self.key is None: + env_from = self.to_env_from_secret() + cp_pod.spec.containers[0].env_from = cp_pod.spec.containers[0].env_from or [] + cp_pod.spec.containers[0].env_from.append(env_from) + return cp_pod + + def __eq__(self, other): + return ( + self.deploy_type == other.deploy_type and + self.deploy_target == other.deploy_target and + self.secret == other.secret and + self.key == other.key + ) + + def __repr__(self): + return 'Secret({}, {}, {}, {})'.format( + self.deploy_type, + self.deploy_target, + self.secret, + self.key + ) diff --git a/airflow/kubernetes/volume.py b/airflow/kubernetes/volume.py new file mode 100644 index 0000000000000..9e5e5c44dd140 --- /dev/null +++ b/airflow/kubernetes/volume.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Classes for interacting with Kubernetes API +""" + +import copy +from airflow.kubernetes.k8s_model import K8SModel + + +class Volume(K8SModel): + def __init__(self, name, configs): + """ Adds a Kubernetes Volume to the pod. Allows the pod to access features like ConfigMaps + and Persistent Volumes. + :param name: the name of the volume mount + :type name: str + :param configs: dictionary of any features needed for the volume. + We purposely keep this vague since there are multiple volume types with changing + configs. + :type configs: dict + """ + self.name = name + self.configs = configs + + def to_k8s_client_obj(self): + from kubernetes.client import models as k8s + resp = k8s.V1Volume(name=self.name) + for k, v in self.configs.items(): + snake_key = Volume._convert_to_snake_case(k) + if hasattr(resp, snake_key): + setattr(resp, snake_key, v) + else: + raise AttributeError("V1Volume does not have attribute {}".format(k)) + return resp + + def attach_to_pod(self, pod): + cp_pod = copy.deepcopy(pod) + volume = self.to_k8s_client_obj() + cp_pod.spec.volumes = pod.spec.volumes or [] + cp_pod.spec.volumes.append(volume) + return cp_pod + + # source: https://www.geeksforgeeks.org/python-program-to-convert-camel-case-string-to-snake-case/ + @staticmethod + def _convert_to_snake_case(str): + return ''.join(['_' + i.lower() if i.isupper() else i for i in str]).lstrip('_') diff --git a/airflow/kubernetes/volume_mount.py b/airflow/kubernetes/volume_mount.py new file mode 100644 index 0000000000000..ab9c34a9c2a47 --- /dev/null +++ b/airflow/kubernetes/volume_mount.py @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Classes for interacting with Kubernetes API +""" + +import copy +from airflow.kubernetes.k8s_model import K8SModel + + +class VolumeMount(K8SModel): + __slots__ = ('name', 'mount_path', 'sub_path', 'read_only') + """ + Initialize a Kubernetes Volume Mount. Used to mount pod level volumes to + running container. + + :param name: the name of the volume mount + :type name: str + :param mount_path: + :type mount_path: str + :param sub_path: subpath within the volume mount + :type sub_path: Optional[str] + :param read_only: whether to access pod with read-only mode + :type read_only: bool + """ + def __init__(self, name, mount_path, sub_path, read_only): + self.name = name + self.mount_path = mount_path + self.sub_path = sub_path + self.read_only = read_only + + def to_k8s_client_obj(self): + """ + Converts to k8s object. + + :return Volume Mount k8s object + """ + import kubernetes.client.models as k8s + return k8s.V1VolumeMount( + name=self.name, + mount_path=self.mount_path, + sub_path=self.sub_path, + read_only=self.read_only + ) + + def attach_to_pod(self, pod): + """ + Attaches to pod + + :return Copy of the Pod object + + """ + cp_pod = copy.deepcopy(pod) + volume_mount = self.to_k8s_client_obj() + cp_pod.spec.containers[0].volume_mounts = pod.spec.containers[0].volume_mounts or [] + cp_pod.spec.containers[0].volume_mounts.append(volume_mount) + return cp_pod diff --git a/airflow/kubernetes/worker_configuration.py b/airflow/kubernetes/worker_configuration.py new file mode 100644 index 0000000000000..327d7d0d9a47c --- /dev/null +++ b/airflow/kubernetes/worker_configuration.py @@ -0,0 +1,459 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
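The new K8SModel subclasses above (Secret, Volume, VolumeMount, plus PodRuntimeInfoEnv earlier) all follow the same pattern: attach_to_pod takes a k8s.V1Pod and returns a modified copy. A rough sketch of chaining them, where the pod, the secret name, and the claim name are illustrative assumptions rather than values from this patch:

    from kubernetes.client import models as k8s

    from airflow.kubernetes.secret import Secret
    from airflow.kubernetes.volume import Volume
    from airflow.kubernetes.volume_mount import VolumeMount

    pod = k8s.V1Pod(
        metadata=k8s.V1ObjectMeta(name="example"),
        spec=k8s.V1PodSpec(containers=[k8s.V1Container(name="base", image="busybox")]),
    )

    k8s_models = [
        # Expose one key of an existing Kubernetes Secret as the SQL_CONN env var.
        Secret("env", "SQL_CONN", "airflow-secrets", "sql_alchemy_conn"),
        # Reference an existing PersistentVolumeClaim and mount it into the container.
        Volume("dags", {"persistentVolumeClaim": {"claimName": "airflow-dags"}}),
        VolumeMount("dags", mount_path="/opt/airflow/dags", sub_path=None, read_only=True),
    ]
    for model in k8s_models:
        pod = model.attach_to_pod(pod)

The append_to_pod helper imported by WorkerConfiguration below applies the same attach_to_pod chaining to a list of such models.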
+ +import os + +import kubernetes.client.models as k8s +import six + +from airflow.configuration import conf +from airflow.kubernetes.k8s_model import append_to_pod +from airflow.kubernetes.pod_generator import PodGenerator +from airflow.kubernetes.secret import Secret +from airflow.utils.log.logging_mixin import LoggingMixin + + +class WorkerConfiguration(LoggingMixin): + """ + Contains Kubernetes Airflow Worker configuration logic + + :param kube_config: the kubernetes configuration from airflow.cfg + :type kube_config: airflow.executors.kubernetes_executor.KubeConfig + """ + + dags_volume_name = 'airflow-dags' + logs_volume_name = 'airflow-logs' + git_sync_ssh_secret_volume_name = 'git-sync-ssh-key' + git_ssh_key_secret_key = 'gitSshKey' + git_sync_ssh_known_hosts_volume_name = 'git-sync-known-hosts' + git_ssh_known_hosts_configmap_key = 'known_hosts' + + def __init__(self, kube_config): + self.kube_config = kube_config + self.worker_airflow_home = self.kube_config.airflow_home + self.worker_airflow_dags = self.kube_config.dags_folder + self.worker_airflow_logs = self.kube_config.base_log_folder + + super(WorkerConfiguration, self).__init__() + + def _get_init_containers(self): + """When using git to retrieve the DAGs, use the GitSync Init Container""" + # If we're using volume claims to mount the dags, no init container is needed + if self.kube_config.dags_volume_claim or \ + self.kube_config.dags_volume_host or self.kube_config.dags_in_image: + return [] + + # Otherwise, define a git-sync init container + init_environment = [k8s.V1EnvVar( + name='GIT_SYNC_REPO', + value=self.kube_config.git_repo + ), k8s.V1EnvVar( + name='GIT_SYNC_BRANCH', + value=self.kube_config.git_branch + ), k8s.V1EnvVar( + name='GIT_SYNC_ROOT', + value=self.kube_config.git_sync_root + ), k8s.V1EnvVar( + name='GIT_SYNC_DEST', + value=self.kube_config.git_sync_dest + ), k8s.V1EnvVar( + name='GIT_SYNC_DEPTH', + value=self.kube_config.git_sync_depth + ), k8s.V1EnvVar( + name='GIT_SYNC_ONE_TIME', + value='true' + ), k8s.V1EnvVar( + name='GIT_SYNC_REV', + value=self.kube_config.git_sync_rev + )] + for env_var_name, env_var_val in six.iteritems(self.kube_config.kube_env_vars): + init_environment.append(k8s.V1EnvVar( + name=env_var_name, + value=env_var_val + )) + if self.kube_config.git_user: + init_environment.append(k8s.V1EnvVar( + name='GIT_SYNC_USERNAME', + value=self.kube_config.git_user + )) + if self.kube_config.git_password: + init_environment.append(k8s.V1EnvVar( + name='GIT_SYNC_PASSWORD', + value=self.kube_config.git_password + )) + + volume_mounts = [k8s.V1VolumeMount( + mount_path=self.kube_config.git_sync_root, + name=self.dags_volume_name, + read_only=False + )] + + if self.kube_config.git_sync_credentials_secret: + init_environment.extend([ + k8s.V1EnvVar( + name='GIT_SYNC_USERNAME', + value_from=k8s.V1EnvVarSource( + secret_key_ref=k8s.V1SecretKeySelector( + name=self.kube_config.git_sync_credentials_secret, + key='GIT_SYNC_USERNAME') + ) + ), + k8s.V1EnvVar( + name='GIT_SYNC_PASSWORD', + value_from=k8s.V1EnvVarSource( + secret_key_ref=k8s.V1SecretKeySelector( + name=self.kube_config.git_sync_credentials_secret, + key='GIT_SYNC_PASSWORD') + ) + ) + ]) + + if self.kube_config.git_ssh_key_secret_name: + volume_mounts.append(k8s.V1VolumeMount( + name=self.git_sync_ssh_secret_volume_name, + mount_path='/etc/git-secret/ssh', + sub_path='ssh' + )) + + init_environment.extend([ + k8s.V1EnvVar( + name='GIT_SSH_KEY_FILE', + value='/etc/git-secret/ssh' + ), + k8s.V1EnvVar( + name='GIT_SYNC_ADD_USER', + 
value='true' + ), + k8s.V1EnvVar( + name='GIT_SYNC_SSH', + value='true' + ) + ]) + + if self.kube_config.git_ssh_known_hosts_configmap_name: + volume_mounts.append(k8s.V1VolumeMount( + name=self.git_sync_ssh_known_hosts_volume_name, + mount_path='/etc/git-secret/known_hosts', + sub_path='known_hosts' + )) + init_environment.extend([k8s.V1EnvVar( + name='GIT_KNOWN_HOSTS', + value='true' + ), k8s.V1EnvVar( + name='GIT_SSH_KNOWN_HOSTS_FILE', + value='/etc/git-secret/known_hosts' + )]) + else: + init_environment.append(k8s.V1EnvVar( + name='GIT_KNOWN_HOSTS', + value='false' + )) + + init_containers = k8s.V1Container( + name=self.kube_config.git_sync_init_container_name, + image=self.kube_config.git_sync_container, + env=init_environment, + volume_mounts=volume_mounts + ) + + if self.kube_config.git_sync_run_as_user != "": + init_containers.security_context = k8s.V1SecurityContext( + run_as_user=self.kube_config.git_sync_run_as_user + ) # git-sync user + + return [init_containers] + + def _get_environment(self): + """Defines any necessary environment variables for the pod executor""" + env = {} + + for env_var_name, env_var_val in six.iteritems(self.kube_config.kube_env_vars): + env[env_var_name] = env_var_val + + env["AIRFLOW__CORE__EXECUTOR"] = "LocalExecutor" + + if self.kube_config.airflow_configmap: + env['AIRFLOW_HOME'] = self.worker_airflow_home + env['AIRFLOW__CORE__DAGS_FOLDER'] = self.worker_airflow_dags + if (not self.kube_config.airflow_configmap and + 'AIRFLOW__CORE__SQL_ALCHEMY_CONN' not in self.kube_config.kube_secrets): + env['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = conf.get("core", "SQL_ALCHEMY_CONN") + if self.kube_config.git_dags_folder_mount_point: + # /root/airflow/dags/repo/dags + dag_volume_mount_path = os.path.join( + self.kube_config.git_dags_folder_mount_point, + self.kube_config.git_sync_dest, # repo + self.kube_config.git_subpath # dags + ) + env['AIRFLOW__CORE__DAGS_FOLDER'] = dag_volume_mount_path + return env + + def _get_configmaps(self): + """Extracts any configmapRefs to envFrom""" + env_from = [] + + if self.kube_config.env_from_configmap_ref: + for config_map_ref in self.kube_config.env_from_configmap_ref.split(','): + env_from.append( + k8s.V1EnvFromSource(config_map_ref=k8s.V1ConfigMapEnvSource(config_map_ref)) + ) + + if self.kube_config.env_from_secret_ref: + for secret_ref in self.kube_config.env_from_secret_ref.split(','): + env_from.append( + k8s.V1EnvFromSource(secret_ref=k8s.V1SecretEnvSource(secret_ref)) + ) + + return env_from + + def _get_env_from(self): + """Extracts any configmapRefs to envFrom""" + env_from = [] + + if self.kube_config.env_from_configmap_ref: + for config_map_ref in self.kube_config.env_from_configmap_ref.split(','): + env_from.append( + k8s.V1EnvFromSource(config_map_ref=k8s.V1ConfigMapEnvSource(config_map_ref)) + ) + + if self.kube_config.env_from_secret_ref: + for secret_ref in self.kube_config.env_from_secret_ref.split(','): + env_from.append( + k8s.V1EnvFromSource(secret_ref=k8s.V1SecretEnvSource(secret_ref)) + ) + + return env_from + + def _get_secrets(self): + """Defines any necessary secrets for the pod executor""" + worker_secrets = [] + + for env_var_name, obj_key_pair in six.iteritems(self.kube_config.kube_secrets): + k8s_secret_obj, k8s_secret_key = obj_key_pair.split('=') + worker_secrets.append( + Secret('env', env_var_name, k8s_secret_obj, k8s_secret_key) + ) + + if self.kube_config.env_from_secret_ref: + for secret_ref in self.kube_config.env_from_secret_ref.split(','): + worker_secrets.append( + Secret('env', None, 
secret_ref) + ) + + return worker_secrets + + def _get_image_pull_secrets(self): + """Extracts any image pull secrets for fetching container(s)""" + if not self.kube_config.image_pull_secrets: + return [] + pull_secrets = self.kube_config.image_pull_secrets.split(',') + return list(map(lambda name: k8s.V1LocalObjectReference(name), pull_secrets)) + + def _get_security_context(self): + """Defines the security context""" + + security_context = k8s.V1PodSecurityContext() + + if self.kube_config.worker_run_as_user != "": + security_context.run_as_user = self.kube_config.worker_run_as_user + + if self.kube_config.worker_fs_group != "": + security_context.fs_group = self.kube_config.worker_fs_group + + # set fs_group to 65533 if not explicitly specified and using git ssh keypair auth + if self.kube_config.git_ssh_key_secret_name and security_context.fs_group is None: + security_context.fs_group = 65533 + + return security_context + + def _get_labels(self, kube_executor_labels, labels): + copy = self.kube_config.kube_labels.copy() + copy.update(kube_executor_labels) + copy.update(labels) + return copy + + def _get_volume_mounts(self): + volume_mounts = { + self.dags_volume_name: k8s.V1VolumeMount( + name=self.dags_volume_name, + mount_path=self.generate_dag_volume_mount_path(), + read_only=True, + ), + self.logs_volume_name: k8s.V1VolumeMount( + name=self.logs_volume_name, + mount_path=self.worker_airflow_logs, + ) + } + + if self.kube_config.dags_volume_subpath: + volume_mounts[self.dags_volume_name].sub_path = self.kube_config.dags_volume_subpath + + if self.kube_config.logs_volume_subpath: + volume_mounts[self.logs_volume_name].sub_path = self.kube_config.logs_volume_subpath + + if self.kube_config.dags_in_image: + del volume_mounts[self.dags_volume_name] + + # Mount the airflow.cfg file via a configmap the user has specified + if self.kube_config.airflow_configmap: + config_volume_name = 'airflow-config' + config_path = '{}/airflow.cfg'.format(self.worker_airflow_home) + volume_mounts[config_volume_name] = k8s.V1VolumeMount( + name=config_volume_name, + mount_path=config_path, + sub_path='airflow.cfg', + read_only=True + ) + if self.kube_config.airflow_local_settings_configmap: + config_path = '{}/config/airflow_local_settings.py'.format(self.worker_airflow_home) + + if self.kube_config.airflow_local_settings_configmap != self.kube_config.airflow_configmap: + config_volume_name = 'airflow-local-settings' + + volume_mounts[config_volume_name] = k8s.V1VolumeMount( + name=config_volume_name, + mount_path=config_path, + sub_path='airflow_local_settings.py', + read_only=True + ) + + else: + volume_mounts['airflow-local-settings'] = k8s.V1VolumeMount( + name='airflow-config', + mount_path=config_path, + sub_path='airflow_local_settings.py', + read_only=True + ) + + return list(volume_mounts.values()) + + def _get_volumes(self): + def _construct_volume(name, claim, host): + volume = k8s.V1Volume(name=name) + + if claim: + volume.persistent_volume_claim = k8s.V1PersistentVolumeClaimVolumeSource( + claim_name=claim + ) + elif host: + volume.host_path = k8s.V1HostPathVolumeSource( + path=host, + type='' + ) + else: + volume.empty_dir = {} + + return volume + + volumes = { + self.dags_volume_name: _construct_volume( + self.dags_volume_name, + self.kube_config.dags_volume_claim, + self.kube_config.dags_volume_host + ), + self.logs_volume_name: _construct_volume( + self.logs_volume_name, + self.kube_config.logs_volume_claim, + self.kube_config.logs_volume_host + ) + } + + if 
self.kube_config.dags_in_image: + del volumes[self.dags_volume_name] + + # Get the SSH key from secrets as a volume + if self.kube_config.git_ssh_key_secret_name: + volumes[self.git_sync_ssh_secret_volume_name] = k8s.V1Volume( + name=self.git_sync_ssh_secret_volume_name, + secret=k8s.V1SecretVolumeSource( + secret_name=self.kube_config.git_ssh_key_secret_name, + items=[k8s.V1KeyToPath( + key=self.git_ssh_key_secret_key, + path='ssh', + mode=0o440 + )] + ) + ) + + if self.kube_config.git_ssh_known_hosts_configmap_name: + volumes[self.git_sync_ssh_known_hosts_volume_name] = k8s.V1Volume( + name=self.git_sync_ssh_known_hosts_volume_name, + config_map=k8s.V1ConfigMapVolumeSource( + name=self.kube_config.git_ssh_known_hosts_configmap_name, + default_mode=0o440 + ) + ) + + # Mount the airflow.cfg file via a configmap the user has specified + if self.kube_config.airflow_configmap: + config_volume_name = 'airflow-config' + volumes[config_volume_name] = k8s.V1Volume( + name=config_volume_name, + config_map=k8s.V1ConfigMapVolumeSource( + name=self.kube_config.airflow_configmap + ) + ) + + if self.kube_config.airflow_local_settings_configmap: + if self.kube_config.airflow_local_settings_configmap != self.kube_config.airflow_configmap: + config_volume_name = 'airflow-local-settings' + volumes[config_volume_name] = k8s.V1Volume( + name=config_volume_name, + config_map=k8s.V1ConfigMapVolumeSource( + name=self.kube_config.airflow_local_settings_configmap + ) + ) + + return list(volumes.values()) + + def generate_dag_volume_mount_path(self): + """Generate path for DAG volume""" + + if self.kube_config.dags_volume_mount_point: + return self.kube_config.dags_volume_mount_point + + if self.kube_config.dags_volume_claim or self.kube_config.dags_volume_host: + return self.worker_airflow_dags + + return self.kube_config.git_dags_folder_mount_point + + def as_pod(self): + """Creates POD.""" + if self.kube_config.pod_template_file: + return PodGenerator(pod_template_file=self.kube_config.pod_template_file).gen_pod() + pod = PodGenerator( + image=self.kube_config.kube_image, + image_pull_policy=self.kube_config.kube_image_pull_policy or 'IfNotPresent', + image_pull_secrets=self.kube_config.image_pull_secrets, + volumes=self._get_volumes(), + volume_mounts=self._get_volume_mounts(), + init_containers=self._get_init_containers(), + labels=self.kube_config.kube_labels, + annotations=self.kube_config.kube_annotations, + affinity=self.kube_config.kube_affinity, + tolerations=self.kube_config.kube_tolerations, + envs=self._get_environment(), + node_selectors=self.kube_config.kube_node_selectors, + service_account_name=self.kube_config.worker_service_account_name or 'default', + restart_policy='Never' + ).gen_pod() + + pod.spec.containers[0].env_from = pod.spec.containers[0].env_from or [] + pod.spec.containers[0].env_from.extend(self._get_env_from()) + pod.spec.security_context = self._get_security_context() + + return append_to_pod(pod, self._get_secrets()) diff --git a/airflow/lineage/__init__.py b/airflow/lineage/__init__.py index f444139f00979..370e9c2657ebd 100644 --- a/airflow/lineage/__init__.py +++ b/airflow/lineage/__init__.py @@ -16,11 +16,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
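For context, a minimal sketch of how the WorkerConfiguration class added above is driven; the KubeConfig import path follows the type hint in its docstring, and a fully configured [kubernetes] section in airflow.cfg is assumed:

    from airflow.executors.kubernetes_executor import KubeConfig
    from airflow.kubernetes.worker_configuration import WorkerConfiguration

    kube_config = KubeConfig()  # reads the [kubernetes] settings from airflow.cfg
    worker_config = WorkerConfiguration(kube_config=kube_config)
    worker_pod = worker_config.as_pod()  # a k8s.V1Pod with volumes, env, secrets and security context applied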
+""" +Provides lineage support functions +""" +import logging + from functools import wraps from airflow.configuration import conf from airflow.lineage.datasets import DataSet -from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.module_loading import import_string from itertools import chain @@ -28,7 +32,7 @@ PIPELINE_OUTLETS = "pipeline_outlets" PIPELINE_INLETS = "pipeline_inlets" -log = LoggingMixin().log +log = logging.getLogger(__name__) def _get_backend(): diff --git a/airflow/lineage/datasets.py b/airflow/lineage/datasets.py index 260277065b6f0..3d61e5d3f4843 100644 --- a/airflow/lineage/datasets.py +++ b/airflow/lineage/datasets.py @@ -16,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import json import six from typing import List @@ -62,7 +63,11 @@ def __getattr__(self, attr): if attr in self.attributes: if self.context: env = Environment() - return env.from_string(self._data.get(attr)).render(**self.context) + # dump to json here in order to be able to manage dicts and lists + rendered = env.from_string( + json.dumps(self._data.get(attr)) + ).render(**self.context) + return json.loads(rendered) return self._data.get(attr) @@ -82,7 +87,9 @@ def as_dict(self): env = Environment() if self.context: for key, value in six.iteritems(attributes): - attributes[key] = env.from_string(value).render(**self.context) + attributes[key] = json.loads( + env.from_string(json.dumps(value)).render(**self.context) + ) d = { "typeName": self.type_name, diff --git a/airflow/migrations/env.py b/airflow/migrations/env.py index 2de0c2f391721..234c795a9e254 100644 --- a/airflow/migrations/env.py +++ b/airflow/migrations/env.py @@ -81,6 +81,7 @@ def run_migrations_online(): with connectable.connect() as connection: context.configure( connection=connection, + transaction_per_migration=True, target_metadata=target_metadata, compare_type=COMPARE_TYPE, render_as_batch=True diff --git a/airflow/migrations/versions/03afc6b6f902_increase_length_of_fab_ab_view_menu_.py b/airflow/migrations/versions/03afc6b6f902_increase_length_of_fab_ab_view_menu_.py new file mode 100644 index 0000000000000..aeb5665e9b34a --- /dev/null +++ b/airflow/migrations/versions/03afc6b6f902_increase_length_of_fab_ab_view_menu_.py @@ -0,0 +1,90 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Increase length of FAB ab_view_menu.name column + +Revision ID: 03afc6b6f902 +Revises: 92c57b58940d +Create Date: 2020-11-13 22:21:41.619565 + +""" + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.engine.reflection import Inspector + +# revision identifiers, used by Alembic. 
+revision = '03afc6b6f902' +down_revision = '92c57b58940d' +branch_labels = None +depends_on = None + + +def upgrade(): + """Apply Increase length of FAB ab_view_menu.name column""" + conn = op.get_bind() + inspector = Inspector.from_engine(conn) + tables = inspector.get_table_names() + + if "ab_view_menu" in tables: + if conn.dialect.name == "sqlite": + op.execute("PRAGMA foreign_keys=off") + op.execute( + """ + CREATE TABLE IF NOT EXISTS ab_view_menu_dg_tmp + ( + id INTEGER NOT NULL PRIMARY KEY, + name VARCHAR(250) NOT NULL UNIQUE + ); + """ + ) + op.execute("INSERT INTO ab_view_menu_dg_tmp(id, name) select id, name from ab_view_menu;") + op.execute("DROP TABLE ab_view_menu") + op.execute("ALTER TABLE ab_view_menu_dg_tmp rename to ab_view_menu;") + op.execute("PRAGMA foreign_keys=on") + else: + op.alter_column( + table_name='ab_view_menu', column_name='name', type_=sa.String(length=250), nullable=False + ) + + +def downgrade(): + """Unapply Increase length of FAB ab_view_menu.name column""" + conn = op.get_bind() + inspector = Inspector.from_engine(conn) + tables = inspector.get_table_names() + if "ab_view_menu" in tables: + if conn.dialect.name == "sqlite": + op.execute("PRAGMA foreign_keys=off") + op.execute( + """ + CREATE TABLE IF NOT EXISTS ab_view_menu_dg_tmp + ( + id INTEGER NOT NULL PRIMARY KEY, + name VARCHAR(100) NOT NULL UNIQUE + ); + """ + ) + op.execute("INSERT INTO ab_view_menu_dg_tmp(id, name) select id, name from ab_view_menu;") + op.execute("DROP TABLE ab_view_menu") + op.execute("ALTER TABLE ab_view_menu_dg_tmp rename to ab_view_menu;") + op.execute("PRAGMA foreign_keys=on") + else: + op.alter_column( + table_name='ab_view_menu', column_name='name', type_=sa.String(length=100), nullable=False + ) diff --git a/airflow/migrations/versions/74effc47d867_change_datetime_to_datetime2_6_on_mssql_.py b/airflow/migrations/versions/74effc47d867_change_datetime_to_datetime2_6_on_mssql_.py index a9ef785131d91..c5630ddf77006 100644 --- a/airflow/migrations/versions/74effc47d867_change_datetime_to_datetime2_6_on_mssql_.py +++ b/airflow/migrations/versions/74effc47d867_change_datetime_to_datetime2_6_on_mssql_.py @@ -231,7 +231,7 @@ def get_table_constraints(conn, table_name): FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS tc JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE AS ccu ON ccu.CONSTRAINT_NAME = tc.CONSTRAINT_NAME WHERE tc.TABLE_NAME = '{table_name}' AND - (tc.CONSTRAINT_TYPE = 'PRIMARY KEY' or tc.CONSTRAINT_TYPE = 'Unique') + (tc.CONSTRAINT_TYPE = 'PRIMARY KEY' or UPPER(tc.CONSTRAINT_TYPE) = 'UNIQUE') """.format(table_name=table_name) result = conn.execute(query).fetchall() constraint_dict = defaultdict(list) diff --git a/airflow/migrations/versions/852ae6c715af_add_rendered_task_instance_fields_table.py b/airflow/migrations/versions/852ae6c715af_add_rendered_task_instance_fields_table.py index c9f24ec2e6151..01357e7c6f706 100644 --- a/airflow/migrations/versions/852ae6c715af_add_rendered_task_instance_fields_table.py +++ b/airflow/migrations/versions/852ae6c715af_add_rendered_task_instance_fields_table.py @@ -46,7 +46,7 @@ def upgrade(): # versions, check for the function existing. 
try: conn.execute("SELECT JSON_VALID(1)").fetchone() - except sa.exc.OperationalError: + except (sa.exc.OperationalError, sa.exc.ProgrammingError): json_type = sa.Text op.create_table( diff --git a/airflow/migrations/versions/92c57b58940d_add_fab_tables.py b/airflow/migrations/versions/92c57b58940d_add_fab_tables.py new file mode 100644 index 0000000000000..38f3c618eb8e6 --- /dev/null +++ b/airflow/migrations/versions/92c57b58940d_add_fab_tables.py @@ -0,0 +1,182 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Create FAB Tables + +Revision ID: 92c57b58940d +Revises: da3f683c3a5a +Create Date: 2020-11-13 19:27:10.161814 + +""" + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.engine.reflection import Inspector + +# revision identifiers, used by Alembic. +revision = '92c57b58940d' +down_revision = 'da3f683c3a5a' +branch_labels = None +depends_on = None + + +def upgrade(): + """Create FAB Tables""" + conn = op.get_bind() + inspector = Inspector.from_engine(conn) + tables = inspector.get_table_names() + if "ab_permission" not in tables: + op.create_table( + 'ab_permission', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('name', sa.String(length=100), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('name'), + ) + + if "ab_view_menu" not in tables: + op.create_table( + 'ab_view_menu', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('name', sa.String(length=100), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('name'), + ) + + if "ab_role" not in tables: + op.create_table( + 'ab_role', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('name', sa.String(length=64), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('name'), + ) + + if "ab_permission_view" not in tables: + op.create_table( + 'ab_permission_view', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('permission_id', sa.Integer(), nullable=True), + sa.Column('view_menu_id', sa.Integer(), nullable=True), + sa.ForeignKeyConstraint(['permission_id'], ['ab_permission.id']), + sa.ForeignKeyConstraint(['view_menu_id'], ['ab_view_menu.id']), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('permission_id', 'view_menu_id'), + ) + + if "ab_permission_view_role" not in tables: + op.create_table( + 'ab_permission_view_role', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('permission_view_id', sa.Integer(), nullable=True), + sa.Column('role_id', sa.Integer(), nullable=True), + sa.ForeignKeyConstraint(['permission_view_id'], ['ab_permission_view.id']), + sa.ForeignKeyConstraint(['role_id'], ['ab_role.id']), + sa.PrimaryKeyConstraint('id'), + 
sa.UniqueConstraint("permission_view_id", "role_id"), + ) + + if "ab_user" not in tables: + op.create_table( + 'ab_user', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('first_name', sa.String(length=64), nullable=False), + sa.Column('last_name', sa.String(length=64), nullable=False), + sa.Column('username', sa.String(length=64), nullable=False), + sa.Column('password', sa.String(length=256), nullable=True), + sa.Column('active', sa.Boolean(), nullable=True), + sa.Column('email', sa.String(length=64), nullable=False), + sa.Column('last_login', sa.DateTime(), nullable=True), + sa.Column('login_count', sa.Integer(), nullable=True), + sa.Column('fail_login_count', sa.Integer(), nullable=True), + sa.Column('created_on', sa.DateTime(), nullable=True), + sa.Column('changed_on', sa.DateTime(), nullable=True), + sa.Column('created_by_fk', sa.Integer(), nullable=True), + sa.Column('changed_by_fk', sa.Integer(), nullable=True), + sa.ForeignKeyConstraint(['changed_by_fk'], ['ab_user.id']), + sa.ForeignKeyConstraint(['created_by_fk'], ['ab_user.id']), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('email'), + sa.UniqueConstraint('username'), + ) + + if "ab_user_role" not in tables: + op.create_table( + 'ab_user_role', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('user_id', sa.Integer(), nullable=True), + sa.Column('role_id', sa.Integer(), nullable=True), + sa.ForeignKeyConstraint( + ['role_id'], + ['ab_role.id'], + ), + sa.ForeignKeyConstraint( + ['user_id'], + ['ab_user.id'], + ), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('user_id', 'role_id'), + ) + + if "ab_register_user" not in tables: + op.create_table( + 'ab_register_user', + sa.Column('id', sa.Integer(), nullable=False, primary_key=True), + sa.Column('first_name', sa.String(length=64), nullable=False), + sa.Column('last_name', sa.String(length=64), nullable=False), + sa.Column('username', sa.String(length=64), nullable=False), + sa.Column('password', sa.String(length=256), nullable=True), + sa.Column('email', sa.String(length=64), nullable=False), + sa.Column('registration_date', sa.DateTime(), nullable=True), + sa.Column('registration_hash', sa.String(length=256), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('username'), + ) + + +def downgrade(): + """Drop FAB Tables""" + conn = op.get_bind() + inspector = Inspector.from_engine(conn) + tables = inspector.get_table_names() + fab_tables = [ + "ab_permission", + "ab_view_menu", + "ab_role", + "ab_permission_view", + "ab_permission_view_role", + "ab_user", + "ab_user_role", + "ab_register_user", + ] + + for table in fab_tables: + if table in tables: + indexes = inspector.get_foreign_keys(table) + for index in indexes: + if conn.dialect.name != "sqlite": + op.drop_constraint(index.get('name'), table, type_='foreignkey') + + for table in fab_tables: + if table in tables: + if conn.dialect.name == "sqlite": + op.execute("PRAGMA foreign_keys=off") + op.drop_table(table) + op.execute("PRAGMA foreign_keys=on") + else: + op.drop_table(table) diff --git a/airflow/migrations/versions/952da73b5eff_add_dag_code_table.py b/airflow/migrations/versions/952da73b5eff_add_dag_code_table.py index 87792c46952c2..81d6ec9fdf833 100644 --- a/airflow/migrations/versions/952da73b5eff_add_dag_code_table.py +++ b/airflow/migrations/versions/952da73b5eff_add_dag_code_table.py @@ -29,7 +29,6 @@ # revision identifiers, used by Alembic. 
from airflow.models.dagcode import DagCode -from airflow.models.serialized_dag import SerializedDagModel revision = '952da73b5eff' down_revision = '852ae6c715af' @@ -38,6 +37,18 @@ def upgrade(): + from sqlalchemy.ext.declarative import declarative_base + + Base = declarative_base() + + class SerializedDagModel(Base): + __tablename__ = 'serialized_dag' + + # There are other columns here, but these are the only ones we need for the SELECT/UPDATE we are doing + dag_id = sa.Column(sa.String(250), primary_key=True) + fileloc = sa.Column(sa.String(2000), nullable=False) + fileloc_hash = sa.Column(sa.BigInteger, nullable=False) + """Apply add source code table""" op.create_table('dag_code', # pylint: disable=no-member sa.Column('fileloc_hash', sa.BigInteger(), @@ -48,11 +59,13 @@ def upgrade(): conn = op.get_bind() if conn.dialect.name not in ('sqlite'): - op.drop_index('idx_fileloc_hash', 'serialized_dag') + if conn.dialect.name == "mssql": + op.drop_index('idx_fileloc_hash', 'serialized_dag') + op.alter_column(table_name='serialized_dag', column_name='fileloc_hash', type_=sa.BigInteger(), nullable=False) - op.create_index( # pylint: disable=no-member - 'idx_fileloc_hash', 'serialized_dag', ['fileloc_hash']) + if conn.dialect.name == "mssql": + op.create_index('idx_fileloc_hash', 'serialized_dag', ['fileloc_hash']) sessionmaker = sa.orm.sessionmaker() session = sessionmaker(bind=conn) diff --git a/airflow/migrations/versions/a66efa278eea_add_precision_to_execution_date_in_mysql.py b/airflow/migrations/versions/a66efa278eea_add_precision_to_execution_date_in_mysql.py new file mode 100644 index 0000000000000..59098a8908dcf --- /dev/null +++ b/airflow/migrations/versions/a66efa278eea_add_precision_to_execution_date_in_mysql.py @@ -0,0 +1,61 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Add Precision to execution_date in RenderedTaskInstanceFields table + +Revision ID: a66efa278eea +Revises: 8f966b9c467a +Create Date: 2020-06-16 21:44:02.883132 + +""" + +from alembic import op +from sqlalchemy.dialects import mysql + +# revision identifiers, used by Alembic. 
+revision = 'a66efa278eea' +down_revision = '952da73b5eff' +branch_labels = None +depends_on = None + +TABLE_NAME = 'rendered_task_instance_fields' +COLUMN_NAME = 'execution_date' + + +def upgrade(): + """Add Precision to execution_date in RenderedTaskInstanceFields table for MySQL""" + conn = op.get_bind() + if conn.dialect.name == "mysql": + op.alter_column( + table_name=TABLE_NAME, + column_name=COLUMN_NAME, + type_=mysql.TIMESTAMP(fsp=6), + nullable=False + ) + + +def downgrade(): + """Unapply Add Precision to execution_date in RenderedTaskInstanceFields table""" + conn = op.get_bind() + if conn.dialect.name == "mysql": + op.alter_column( + table_name=TABLE_NAME, + column_name=COLUMN_NAME, + type_=mysql.TIMESTAMP(), + nullable=False + ) diff --git a/airflow/migrations/versions/b3b105409875_add_root_dag_id_to_dag.py b/airflow/migrations/versions/b3b105409875_add_root_dag_id_to_dag.py index dc8260a8d65e6..a193f9fed9706 100644 --- a/airflow/migrations/versions/b3b105409875_add_root_dag_id_to_dag.py +++ b/airflow/migrations/versions/b3b105409875_add_root_dag_id_to_dag.py @@ -17,9 +17,11 @@ # under the License. """add root_dag_id to DAG + Revision ID: b3b105409875 Revises: d38e04c12aa2 Create Date: 2019-09-28 23:20:01.744775 + """ import sqlalchemy as sa diff --git a/airflow/migrations/versions/d38e04c12aa2_add_serialized_dag_table.py b/airflow/migrations/versions/d38e04c12aa2_add_serialized_dag_table.py index 52b5f7a28cea5..fb3eca4fe1a0c 100644 --- a/airflow/migrations/versions/d38e04c12aa2_add_serialized_dag_table.py +++ b/airflow/migrations/versions/d38e04c12aa2_add_serialized_dag_table.py @@ -17,9 +17,11 @@ # under the License. """add serialized_dag table + Revision ID: d38e04c12aa2 Revises: 6e96a59344a4 Create Date: 2019-08-01 14:39:35.616417 + """ from alembic import op from sqlalchemy.dialects import mysql diff --git a/airflow/migrations/versions/da3f683c3a5a_add_dag_hash_column_to_serialized_dag_.py b/airflow/migrations/versions/da3f683c3a5a_add_dag_hash_column_to_serialized_dag_.py new file mode 100644 index 0000000000000..4acda3b39eb6d --- /dev/null +++ b/airflow/migrations/versions/da3f683c3a5a_add_dag_hash_column_to_serialized_dag_.py @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Add dag_hash Column to serialized_dag table + +Revision ID: da3f683c3a5a +Revises: 8d48763f6d53 +Create Date: 2020-08-07 20:52:09.178296 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = 'da3f683c3a5a' +down_revision = 'a66efa278eea' +branch_labels = None +depends_on = None + + +def upgrade(): + """Apply Add dag_hash Column to serialized_dag table""" + op.add_column( + 'serialized_dag', + sa.Column('dag_hash', sa.String(32), nullable=False, server_default='Hash not calculated yet')) + + +def downgrade(): + """Unapply Add dag_hash Column to serialized_dag table""" + op.drop_column('serialized_dag', 'dag_hash') diff --git a/airflow/models/baseoperator.py b/airflow/models/baseoperator.py index 6abf9c128d887..1dcd628d8280b 100644 --- a/airflow/models/baseoperator.py +++ b/airflow/models/baseoperator.py @@ -45,6 +45,7 @@ from airflow.models.taskinstance import TaskInstance, clear_task_instances from airflow.models.xcom import XCOM_RETURN_KEY from airflow.ti_deps.deps.not_in_retry_period_dep import NotInRetryPeriodDep +from airflow.ti_deps.deps.not_previously_skipped_dep import NotPreviouslySkippedDep from airflow.ti_deps.deps.prev_dagrun_dep import PrevDagrunDep from airflow.ti_deps.deps.trigger_rule_dep import TriggerRuleDep from airflow.utils import timezone @@ -227,9 +228,9 @@ class derived from this one results in the creation of a task object, MyOperator(..., executor_config={ - "KubernetesExecutor": - {"image": "myCustomDockerImage"} - } + "KubernetesExecutor": + {"image": "myCustomDockerImage"} + } ) :type executor_config: dict @@ -428,8 +429,8 @@ def __init__( self._log = logging.getLogger("airflow.task.operators") # lineage - self.inlets = [] # type: Iterable[DataSet] - self.outlets = [] # type: Iterable[DataSet] + self.inlets = [] # type: List[DataSet] + self.outlets = [] # type: List[DataSet] self.lineage_data = None self._inlets = { @@ -546,7 +547,8 @@ def dag(self, dag): "The DAG assigned to {} can not be changed.".format(self)) elif self.task_id not in dag.task_dict: dag.add_task(self) - + elif self.task_id in dag.task_dict and dag.task_dict[self.task_id] is not self: + dag.add_task(self) self._dag = dag def has_dag(self): @@ -574,6 +576,7 @@ def deps(self): NotInRetryPeriodDep(), PrevDagrunDep(), TriggerRuleDep(), + NotPreviouslySkippedDep(), } @property @@ -584,7 +587,7 @@ def schedule_interval(self): schedule_interval as it may not be attached to a DAG. 
""" if self.has_dag(): - return self.dag._schedule_interval + return self.dag.normalized_schedule_interval else: return self._schedule_interval @@ -873,14 +876,11 @@ def clear(self, tasks += [ t.task_id for t in self.get_flat_relatives(upstream=False)] - qry = qry.filter(TI.task_id.in_(tasks)) - - count = qry.count() - - clear_task_instances(qry.all(), session, dag=self.dag) - + qry = qry.filter(TaskInstance.task_id.in_(tasks)) + results = qry.all() + count = len(results) + clear_task_instances(results, session, dag=self.dag) session.commit() - return count @provide_session diff --git a/airflow/models/connection.py b/airflow/models/connection.py index 6b9c5074bbd6f..4fac12c0914c3 100644 --- a/airflow/models/connection.py +++ b/airflow/models/connection.py @@ -109,6 +109,8 @@ class Connection(Base, LoggingMixin): ('grpc', 'GRPC Connection'), ('rabbitmq', 'AMQP Connection(rabbitmq)'), ('kafka', 'Kafka'), + ('yandexcloud', 'Yandex Cloud'), + ('spark', 'Spark'), ] def __init__( diff --git a/airflow/models/crypto.py b/airflow/models/crypto.py index abc7d2d24f481..79e6ab3c87eca 100644 --- a/airflow/models/crypto.py +++ b/airflow/models/crypto.py @@ -19,9 +19,12 @@ from builtins import ImportError as BuiltinImportError +import logging + from airflow.configuration import conf from airflow.exceptions import AirflowException -from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) class InvalidFernetToken(Exception): @@ -62,7 +65,6 @@ def get_fernet(): :raises: airflow.exceptions.AirflowException if there's a problem trying to load Fernet """ global _fernet - log = LoggingMixin().log if _fernet: return _fernet diff --git a/airflow/models/dag.py b/airflow/models/dag.py index b2758bf45651d..a1908e34e7a3a 100644 --- a/airflow/models/dag.py +++ b/airflow/models/dag.py @@ -21,6 +21,7 @@ import copy import functools +import logging import os import pickle import re @@ -68,6 +69,8 @@ install_aliases() +log = logging.getLogger(__name__) + ScheduleInterval = Union[str, timedelta, relativedelta] @@ -215,13 +218,13 @@ class DAG(BaseDag, LoggingMixin): def __init__( self, dag_id, # type: str - description='', # type: str + description=None, # type: Optional[str] schedule_interval=timedelta(days=1), # type: Optional[ScheduleInterval] start_date=None, # type: Optional[datetime] end_date=None, # type: Optional[datetime] full_filepath=None, # type: Optional[str] template_searchpath=None, # type: Optional[Union[str, Iterable[str]]] - template_undefined=jinja2.Undefined, # type: Type[jinja2.Undefined] + template_undefined=None, # type: Optional[Type[jinja2.Undefined]] user_defined_macros=None, # type: Optional[Dict] user_defined_filters=None, # type: Optional[Dict] default_args=None, # type: Optional[Dict] @@ -262,7 +265,8 @@ def __init__( self._description = description # set file location to caller source path - self.fileloc = sys._getframe().f_back.f_code.co_filename + back = sys._getframe().f_back + self.fileloc = back.f_code.co_filename if back else "" self.task_dict = dict() # type: Dict[str, BaseOperator] # set timezone from start_date @@ -299,12 +303,6 @@ def __init__( ) self.schedule_interval = schedule_interval - if isinstance(schedule_interval, six.string_types) and schedule_interval in cron_presets: - self._schedule_interval = cron_presets.get(schedule_interval) # type: Optional[ScheduleInterval] - elif schedule_interval == '@once': - self._schedule_interval = None - else: - self._schedule_interval = schedule_interval if isinstance(template_searchpath, 
six.string_types): template_searchpath = [template_searchpath] self.template_searchpath = template_searchpath @@ -389,7 +387,7 @@ def date_range(self, start_date, num=None, end_date=timezone.utcnow()): end_date = None return utils_date_range( start_date=start_date, end_date=end_date, - num=num, delta=self._schedule_interval) + num=num, delta=self.normalized_schedule_interval) def is_fixed_time_schedule(self): """ @@ -398,7 +396,7 @@ def is_fixed_time_schedule(self): :return: True if the schedule has a fixed time, False if not. """ now = datetime.now() - cron = croniter(self._schedule_interval, now) + cron = croniter(self.normalized_schedule_interval, now) start = cron.get_next(datetime) cron_next = cron.get_next(datetime) @@ -415,12 +413,12 @@ def following_schedule(self, dttm): :param dttm: utc datetime :return: utc datetime """ - if isinstance(self._schedule_interval, six.string_types): + if isinstance(self.normalized_schedule_interval, six.string_types): # we don't want to rely on the transitions created by # croniter as they are not always correct dttm = pendulum.instance(dttm) naive = timezone.make_naive(dttm, self.timezone) - cron = croniter(self._schedule_interval, naive) + cron = croniter(self.normalized_schedule_interval, naive) # We assume that DST transitions happen on the minute/hour if not self.is_fixed_time_schedule(): @@ -433,8 +431,8 @@ def following_schedule(self, dttm): tz = pendulum.timezone(self.timezone.name) following = timezone.make_aware(naive, tz) return timezone.convert_to_utc(following) - elif self._schedule_interval is not None: - return dttm + self._schedule_interval + elif self.normalized_schedule_interval is not None: + return dttm + self.normalized_schedule_interval def previous_schedule(self, dttm): """ @@ -443,12 +441,12 @@ def previous_schedule(self, dttm): :param dttm: utc datetime :return: utc datetime """ - if isinstance(self._schedule_interval, six.string_types): + if isinstance(self.normalized_schedule_interval, six.string_types): # we don't want to rely on the transitions created by # croniter as they are not always correct dttm = pendulum.instance(dttm) naive = timezone.make_naive(dttm, self.timezone) - cron = croniter(self._schedule_interval, naive) + cron = croniter(self.normalized_schedule_interval, naive) # We assume that DST transitions happen on the minute/hour if not self.is_fixed_time_schedule(): @@ -461,8 +459,8 @@ def previous_schedule(self, dttm): tz = pendulum.timezone(self.timezone.name) previous = timezone.make_aware(naive, tz) return timezone.convert_to_utc(previous) - elif self._schedule_interval is not None: - return dttm - self._schedule_interval + elif self.normalized_schedule_interval is not None: + return dttm - self.normalized_schedule_interval def get_run_dates(self, start_date, end_date=None): """ @@ -645,6 +643,25 @@ def is_paused(self): """ return self._get_is_paused() + @property + def normalized_schedule_interval(self): + # type: () -> Optional[ScheduleInterval] + """ + Returns Normalized Schedule Interval. This is used internally by the Scheduler to + schedule DAGs. + + 1. Converts Cron Preset to a Cron Expression (e.g ``@monthly`` to ``0 0 1 * *``) + 2. If Schedule Interval is "@once" return "None" + 3. 
If not (1) or (2) returns schedule_interval + """ + if isinstance(self.schedule_interval, six.string_types) and self.schedule_interval in cron_presets: + _schedule_interval = cron_presets.get(self.schedule_interval) # type: Optional[ScheduleInterval] + elif self.schedule_interval == '@once': + _schedule_interval = None + else: + _schedule_interval = self.schedule_interval + return _schedule_interval + @provide_session def handle_callback(self, dagrun, success=True, reason=None, session=None): """ @@ -790,7 +807,7 @@ def get_template_env(self): # type: () -> jinja2.Environment # Default values (for backward compatibility) jinja_env_options = { 'loader': jinja2.FileSystemLoader(searchpath), - 'undefined': self.template_undefined, + 'undefined': self.template_undefined or jinja2.Undefined, 'extensions': ["jinja2.ext.do"], 'cache_size': 0 } @@ -1038,7 +1055,7 @@ def clear( instances = tis.all() for ti in instances: if ti.operator == ExternalTaskMarker.__name__: - ti.task = self.get_task(ti.task_id) + ti.task = copy.copy(self.get_task(ti.task_id)) if recursion_depth == 0: # Maximum recursion depth allowed is the recursion_depth of the first @@ -1315,13 +1332,13 @@ def add_task(self, task): elif task.end_date and self.end_date: task.end_date = min(task.end_date, self.end_date) - if task.task_id in self.task_dict: + if task.task_id in self.task_dict and self.task_dict[task.task_id] is not task: # TODO: raise an error in Airflow 2.0 warnings.warn( - 'The requested task could not be added to the DAG because a ' + 'The requested task could not be added to the DAG with dag_id {} because a ' 'task with task_id {} is already in the DAG. Starting in ' 'Airflow 2.0, trying to overwrite a task will raise an ' - 'exception.'.format(task.task_id), + 'exception.'.format(self.dag_id, task.task_id), category=PendingDeprecationWarning) else: self.task_dict[task.task_id] = task @@ -1446,6 +1463,8 @@ def create_dagrun(self, :type start_date: datetime :param external_trigger: whether this dag run is externally triggered :type external_trigger: bool + :param conf: Dict containing configuration/parameters to pass to the DAG + :type conf: dict :param session: database session :type session: sqlalchemy.orm.session.Session """ @@ -1515,7 +1534,7 @@ def sync_to_db(self, owner=None, sync_time=None, session=None): orm_dag.schedule_interval = self.schedule_interval orm_dag.tags = self.get_dagtags(session=session) - if conf.getboolean('core', 'store_dag_code', fallback=False): + if settings.STORE_DAG_CODE: DagCode.bulk_sync_to_db([orm_dag.fileloc]) session.commit() @@ -1587,7 +1606,6 @@ def deactivate_stale_dags(expiration_date, session=None): :type expiration_date: datetime :return: None """ - log = LoggingMixin().log for dag in session.query( DagModel).filter(DagModel.last_scheduler_run < expiration_date, DagModel.is_active).all(): @@ -1677,7 +1695,7 @@ def get_serialized_fields(cls): cls.__serialized_fields = frozenset(vars(DAG(dag_id='test')).keys()) - { 'parent_dag', '_old_context_manager_dags', 'safe_dag_id', 'last_loaded', '_full_filepath', 'user_defined_filters', 'user_defined_macros', - '_schedule_interval', 'partial', '_old_context_manager_dags', + 'partial', '_old_context_manager_dags', '_pickle_id', '_log', 'is_subdag', 'task_dict', 'template_searchpath', 'sla_miss_callback', 'on_success_callback', 'on_failure_callback', 'template_undefined', 'jinja_environment_kwargs' @@ -1693,6 +1711,9 @@ class DagTag(Base): name = Column(String(100), primary_key=True) dag_id = Column(String(ID_LEN), ForeignKey('dag.dag_id'), 
primary_key=True) + def __repr__(self): + return self.name + class DagModel(Base): @@ -1771,6 +1792,26 @@ def get_last_dagrun(self, session=None, include_externally_triggered=False): return get_last_dagrun(self.dag_id, session=session, include_externally_triggered=include_externally_triggered) + @staticmethod + @provide_session + def get_paused_dag_ids(dag_ids, session): + """ + Given a list of dag_ids, get a set of Paused Dag Ids + + :param dag_ids: List of Dag ids + :param session: ORM Session + :return: Paused Dag_ids + """ + paused_dag_ids = ( + session.query(DagModel.dag_id) + .filter(DagModel.is_paused.is_(True)) + .filter(DagModel.dag_id.in_(dag_ids)) + .all() + ) + + paused_dag_ids = set(paused_dag_id for paused_dag_id, in paused_dag_ids) + return paused_dag_ids + @property def safe_dag_id(self): return self.dag_id.replace('.', '__dot__') @@ -1865,7 +1906,6 @@ def deactivate_deleted_dags(cls, alive_dag_filelocs, session=None): :param alive_dag_filelocs: file paths of alive DAGs :param session: ORM Session """ - log = LoggingMixin().log log.debug("Deactivating DAGs (for which DAG files are deleted) from %s table ", cls.__tablename__) dag_models = session.query(cls).all() diff --git a/airflow/models/dagbag.py b/airflow/models/dagbag.py index 67b44fcded094..f68c4207e1300 100644 --- a/airflow/models/dagbag.py +++ b/airflow/models/dagbag.py @@ -28,7 +28,7 @@ import textwrap import zipfile from collections import namedtuple -from datetime import datetime +from datetime import datetime, timedelta from croniter import CroniterBadCronError, CroniterBadDateError, CroniterNotAlphaError, croniter import six @@ -36,7 +36,7 @@ from airflow import settings from airflow.configuration import conf from airflow.dag.base_dag import BaseDagBag -from airflow.exceptions import AirflowDagCycleException +from airflow.exceptions import AirflowClusterPolicyViolation, AirflowDagCycleException from airflow.executors import get_default_executor from airflow.settings import Stats from airflow.utils import timezone @@ -102,6 +102,7 @@ def __init__( self.import_errors = {} self.has_logged = False self.store_serialized_dags = store_serialized_dags + self.dags_last_fetched = {} self.collect_dags( dag_folder=dag_folder, @@ -127,20 +128,26 @@ def get_dag(self, dag_id): """ from airflow.models.dag import DagModel # Avoid circular import - # Only read DAGs from DB if this dagbag is store_serialized_dags. if self.store_serialized_dags: # Import here so that serialized dag is only imported when serialization is enabled from airflow.models.serialized_dag import SerializedDagModel if dag_id not in self.dags: # Load from DB if not (yet) in the bag - row = SerializedDagModel.get(dag_id) - if not row: - return None - - dag = row.dag - for subdag in dag.subdags: - self.dags[subdag.dag_id] = subdag - self.dags[dag.dag_id] = dag + self._add_dag_from_db(dag_id=dag_id) + return self.dags.get(dag_id) + + # If DAG is in the DagBag, check the following + # 1. if time has come to check if DAG is updated (controlled by min_serialized_dag_fetch_secs) + # 2. check the last_updated column in SerializedDag table to see if Serialized DAG is updated + # 3. if (2) is yes, fetch the Serialized DAG. 
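The three-step comment above describes a time-bounded cache refresh: a serialized DAG already held in the DagBag is only re-read from the database once the configured min_serialized_dag_fetch_secs has elapsed and the row's last_updated timestamp is newer than the local copy. A self-contained sketch of that check follows; the helper names and the hard-coded stand-in for the last_updated lookup are local to the sketch, not Airflow APIs.

from datetime import datetime, timedelta

MIN_FETCH_INTERVAL = timedelta(seconds=10)
dags_last_fetched = {"example": datetime(2020, 8, 1, 12, 0, 0)}   # dag_id -> last local fetch


def last_updated_in_db(dag_id):
    # Stand-in for reading serialized_dag.last_updated for this dag_id.
    return datetime(2020, 8, 1, 12, 0, 5)


def should_refetch(dag_id, now):
    last_fetch = dags_last_fetched.get(dag_id)
    if last_fetch is None:
        return True                                   # not cached yet: load from DB
    if now < last_fetch + MIN_FETCH_INTERVAL:
        return False                                  # interval not elapsed: keep cached copy
    return last_updated_in_db(dag_id) > last_fetch    # refetch only if the DB row is newer


print(should_refetch("example", datetime(2020, 8, 1, 12, 0, 3)))    # False: within the interval
print(should_refetch("example", datetime(2020, 8, 1, 12, 0, 30)))   # True: interval elapsed, row is newer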
+ min_serialized_dag_fetch_secs = timedelta(seconds=settings.MIN_SERIALIZED_DAG_FETCH_INTERVAL) + if ( + dag_id in self.dags_last_fetched and + timezone.utcnow() > self.dags_last_fetched[dag_id] + min_serialized_dag_fetch_secs + ): + sd_last_updated_datetime = SerializedDagModel.get_last_updated_datetime(dag_id=dag_id) + if sd_last_updated_datetime > self.dags_last_fetched[dag_id]: + self._add_dag_from_db(dag_id=dag_id) return self.dags.get(dag_id) @@ -178,6 +185,19 @@ def get_dag(self, dag_id): del self.dags[dag_id] return self.dags.get(dag_id) + def _add_dag_from_db(self, dag_id): + """Add DAG to DagBag from DB""" + from airflow.models.serialized_dag import SerializedDagModel + row = SerializedDagModel.get(dag_id) + if not row: + raise ValueError("DAG '{}' not found in serialized_dag table".format(dag_id)) + + dag = row.dag + for subdag in dag.subdags: + self.dags[subdag.dag_id] = subdag + self.dags[dag.dag_id] = dag + self.dags_last_fetched[dag.dag_id] = timezone.utcnow() + def process_file(self, filepath, only_if_updated=True, safe_mode=True): """ Given a path to a python module or zip file, this method imports @@ -285,8 +305,8 @@ def process_file(self, filepath, only_if_updated=True, safe_mode=True): try: dag.is_subdag = False self.bag_dag(dag, parent_dag=dag, root_dag=dag) - if isinstance(dag._schedule_interval, six.string_types): - croniter(dag._schedule_interval) + if isinstance(dag.normalized_schedule_interval, six.string_types): + croniter(dag.normalized_schedule_interval) found_dags.append(dag) found_dags += dag.subdags except (CroniterBadCronError, @@ -297,9 +317,10 @@ def process_file(self, filepath, only_if_updated=True, safe_mode=True): "Invalid Cron expression: " + str(cron_e) self.file_last_changed[dag.full_filepath] = \ file_last_changed_on_disk - except AirflowDagCycleException as cycle_exception: + except (AirflowDagCycleException, + AirflowClusterPolicyViolation) as exception: self.log.exception("Failed to bag_dag: %s", dag.full_filepath) - self.import_errors[dag.full_filepath] = str(cycle_exception) + self.import_errors[dag.full_filepath] = str(exception) self.file_last_changed[dag.full_filepath] = \ file_last_changed_on_disk @@ -402,8 +423,6 @@ def collect_dags( dag_folder = correct_maybe_zipped(dag_folder) - dags_by_name = {} - for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode, include_examples=include_examples): try: @@ -415,9 +434,6 @@ def collect_dags( dag_id_names = str(dag_ids) td = timezone.utcnow() - ts - td = td.total_seconds() + ( - float(td.microseconds) / 1000000) - dags_by_name[dag_id_names] = dag_ids stats.append(FileLoadStat( filepath.replace(settings.DAGS_FOLDER, ''), td, @@ -430,13 +446,9 @@ def collect_dags( self.dagbag_stats = sorted( stats, key=lambda x: x.duration, reverse=True) for file_stat in self.dagbag_stats: - dag_ids = dags_by_name[file_stat.dags] - if file_stat.dag_num >= 1: - # if we found multiple dags per file, the stat is 'dag_id1 _ dag_id2' - dag_names = '_'.join(dag_ids) - Stats.timing('dag.loading-duration.{}'. 
- format(dag_names), - file_stat.duration) + # file_stat.file similar format: /subdir/dag_name.py + filename = file_stat.file.split('/')[-1].replace('.py', '') + Stats.timing('dag.loading-duration.{}'.format(filename), file_stat.duration) def collect_dags_from_db(self): """Collects DAGs from database.""" @@ -473,7 +485,7 @@ def dagbag_report(self): stats = self.dagbag_stats return report.format( dag_folder=self.dag_folder, - duration=sum([o.duration for o in stats]), + duration=sum([o.duration for o in stats], timedelta()).total_seconds(), dag_num=sum([o.dag_num for o in stats]), task_num=sum([o.task_num for o in stats]), table=pprinttable(stats), diff --git a/airflow/models/dagcode.py b/airflow/models/dagcode.py index 12de28f2f94a6..6aa7b6a87e465 100644 --- a/airflow/models/dagcode.py +++ b/airflow/models/dagcode.py @@ -17,13 +17,13 @@ import logging import os import struct -from datetime import datetime, timedelta +from datetime import datetime from sqlalchemy import BigInteger, Column, String, UnicodeText, and_, exists -from airflow.configuration import conf from airflow.exceptions import AirflowException, DagCodeNotFound from airflow.models import Base +from airflow.settings import STORE_DAG_CODE from airflow.utils import timezone from airflow.utils.file import correct_maybe_zipped, open_maybe_zipped from airflow.utils.db import provide_session @@ -123,15 +123,14 @@ def bulk_sync_to_db(cls, filelocs, session=None): session.add(orm_dag_code) for fileloc in existing_filelocs: - old_version = existing_orm_dag_codes_by_fileloc_hashes[ - filelocs_to_hashes[fileloc] - ] - file_modified = datetime.fromtimestamp( - os.path.getmtime(correct_maybe_zipped(fileloc)), tz=timezone.utc) + current_version = existing_orm_dag_codes_by_fileloc_hashes[filelocs_to_hashes[fileloc]] + file_mod_time = datetime.fromtimestamp( + os.path.getmtime(correct_maybe_zipped(fileloc)), tz=timezone.utc + ) - if (file_modified - timedelta(seconds=120)) > old_version.last_updated: + if file_mod_time > current_version.last_updated: orm_dag_code = existing_orm_dag_codes_map[fileloc] - orm_dag_code.last_updated = timezone.utcnow() + orm_dag_code.last_updated = file_mod_time orm_dag_code.source_code = cls._get_code_from_file(orm_dag_code.fileloc) session.merge(orm_dag_code) @@ -179,7 +178,7 @@ def code(cls, fileloc): :return: source code as string """ - if conf.getboolean('core', 'store_dag_code', fallback=False): + if STORE_DAG_CODE: return cls._get_code_from_db(fileloc) else: return cls._get_code_from_file(fileloc) diff --git a/airflow/models/dagrun.py b/airflow/models/dagrun.py index 6620851e8d757..acfba4fd481cc 100644 --- a/airflow/models/dagrun.py +++ b/airflow/models/dagrun.py @@ -23,13 +23,14 @@ Column, Integer, String, Boolean, PickleType, Index, UniqueConstraint, func, DateTime, or_, and_ ) +from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.declarative import declared_attr from sqlalchemy.orm import synonym from sqlalchemy.orm.session import Session from airflow.exceptions import AirflowException -from airflow.models.base import Base, ID_LEN -from airflow.settings import Stats -from airflow.ti_deps.dep_context import DepContext +from airflow.models.base import ID_LEN, Base +from airflow.settings import Stats, task_instance_mutation_hook +from airflow.ti_deps.dep_context import SCHEDULEABLE_STATES, DepContext from airflow.utils import timezone from airflow.utils.db import provide_session from airflow.utils.log.logging_mixin import LoggingMixin @@ -188,7 +189,6 @@ def get_task_instances(self, state=None, 
session=None): if self.dag and self.dag.partial: tis = tis.filter(TaskInstance.task_id.in_(self.dag.task_ids)) - return tis.all() @provide_session @@ -256,53 +256,42 @@ def update_state(self, session=None): Determines the overall state of the DagRun based on the state of its TaskInstances. - :return: State + :return: ready_tis: the tis that can be scheduled in the current loop + :rtype ready_tis: list[airflow.models.TaskInstance] """ dag = self.get_dag() - - tis = self.get_task_instances(session=session) - self.log.debug("Updating state for %s considering %s task(s)", self, len(tis)) - + ready_tis = [] + tis = [ti for ti in self.get_task_instances(session=session, + state=State.task_states + (State.SHUTDOWN,))] + self.log.debug("number of tis tasks for %s: %s task(s)", self, len(tis)) for ti in list(tis): - # skip in db? - if ti.state == State.REMOVED: - tis.remove(ti) - else: - ti.task = dag.get_task(ti.task_id) + ti.task = dag.get_task(ti.task_id) - # pre-calculate - # db is faster start_dttm = timezone.utcnow() - unfinished_tasks = self.get_task_instances( - state=State.unfinished(), - session=session - ) + unfinished_tasks = [t for t in tis if t.state in State.unfinished()] + finished_tasks = [t for t in tis if t.state in State.finished() + [State.UPSTREAM_FAILED]] none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks) none_task_concurrency = all(t.task.task_concurrency is None for t in unfinished_tasks) # small speed up - if unfinished_tasks and none_depends_on_past and none_task_concurrency: - # todo: this can actually get pretty slow: one task costs between 0.01-015s - no_dependencies_met = True - for ut in unfinished_tasks: - # We need to flag upstream and check for changes because upstream - # failures/re-schedules can result in deadlock false positives - old_state = ut.state - deps_met = ut.are_dependencies_met( - dep_context=DepContext( - flag_upstream_failed=True, - ignore_in_retry_period=True, - ignore_in_reschedule_period=True), - session=session) - if deps_met or old_state != ut.current_state(session=session): - no_dependencies_met = False - break - - duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000 + if unfinished_tasks: + scheduleable_tasks = [ut for ut in unfinished_tasks if ut.state in SCHEDULEABLE_STATES] + self.log.debug( + "number of scheduleable tasks for %s: %s task(s)", + self, len(scheduleable_tasks)) + ready_tis, changed_tis = self._get_ready_tis(scheduleable_tasks, finished_tasks, session) + self.log.debug("ready tis length for %s: %s task(s)", self, len(ready_tis)) + if none_depends_on_past and none_task_concurrency: + # small speed up + are_runnable_tasks = ready_tis or self._are_premature_tis( + unfinished_tasks, finished_tasks, session) or changed_tis + + duration = (timezone.utcnow() - start_dttm) Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration) - leaf_tis = [ti for ti in tis if ti.task_id in {t.task_id for t in dag.leaves}] + leaf_task_ids = {t.task_id for t in dag.leaves} + leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids] # if all roots finished and at least one failed, the run failed if not unfinished_tasks and any( @@ -323,7 +312,7 @@ def update_state(self, session=None): # if *all tasks* are deadlocked, the run failed elif (unfinished_tasks and none_depends_on_past and - none_task_concurrency and no_dependencies_met): + none_task_concurrency and not are_runnable_tasks): self.log.info('Deadlock; marking run %s failed', self) self.set_state(State.FAILED) 
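The reworked update_state above fetches the task instances once and partitions them in Python rather than issuing extra per-check queries: finished versus unfinished, then the schedulable subset that _get_ready_tis probes for met dependencies. Roughly, with plain dictionaries standing in for TaskInstance rows and approximate state sets standing in for Airflow's State.finished() and SCHEDULEABLE_STATES:

FINISHED = {"success", "failed", "skipped", "upstream_failed"}     # approximates State.finished()
SCHEDULEABLE = {None, "up_for_retry", "up_for_reschedule"}         # approximates SCHEDULEABLE_STATES

tis = [
    {"task_id": "extract", "state": "success"},
    {"task_id": "transform", "state": None},
    {"task_id": "load", "state": "running"},
]

finished = [ti for ti in tis if ti["state"] in FINISHED]
unfinished = [ti for ti in tis if ti["state"] not in FINISHED]
scheduleable = [ti for ti in unfinished if ti["state"] in SCHEDULEABLE]

print([ti["task_id"] for ti in finished])       # ['extract']
print([ti["task_id"] for ti in scheduleable])   # ['transform'] -> candidates for _get_ready_tis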
dag.handle_callback(self, success=False, reason='all_tasks_deadlocked', @@ -333,13 +322,83 @@ def update_state(self, session=None): else: self.set_state(State.RUNNING) + self._emit_true_scheduling_delay_stats_for_finished_state(finished_tasks) self._emit_duration_stats_for_finished_state() # todo: determine we want to use with_for_update to make sure to lock the run session.merge(self) session.commit() - return self.state + return ready_tis + + def _get_ready_tis(self, scheduleable_tasks, finished_tasks, session): + ready_tis = [] + changed_tis = False + for st in scheduleable_tasks: + st_old_state = st.state + if st.are_dependencies_met( + dep_context=DepContext( + flag_upstream_failed=True, + finished_tasks=finished_tasks), + session=session): + ready_tis.append(st) + elif st_old_state != st.current_state(session=session): + changed_tis = True + return ready_tis, changed_tis + + def _are_premature_tis(self, unfinished_tasks, finished_tasks, session): + # there might be runnable tasks that are up for retry and from some reason(retry delay, etc) are + # not ready yet so we set the flags to count them in + for ut in unfinished_tasks: + if ut.are_dependencies_met( + dep_context=DepContext( + flag_upstream_failed=True, + ignore_in_retry_period=True, + ignore_in_reschedule_period=True, + finished_tasks=finished_tasks), + session=session): + return True + + def _emit_true_scheduling_delay_stats_for_finished_state(self, finished_tis): + """ + This is a helper method to emit the true scheduling delay stats, which is defined as + the time when the first task in DAG starts minus the expected DAG run datetime. + This method will be used in the update_state method when the state of the DagRun + is updated to a completed status (either success or failure). The method will find the first + started task within the DAG and calculate the expected DagRun start time (based on + dag.execution_date & dag.schedule_interval), and minus these two values to get the delay. + The emitted data may contains outlier (e.g. when the first task was cleared, so + the second task's start_date will be used), but we can get rid of the the outliers + on the stats side through the dashboards tooling built. + Note, the stat will only be emitted if the DagRun is a scheduler triggered one + (i.e. external_trigger is False). + """ + if self.state == State.RUNNING: + return + if self.external_trigger: + return + if not finished_tis: + return + + try: + dag = self.get_dag() + + if not self.dag.schedule_interval or self.dag.schedule_interval == "@once": + # We can't emit this metric if there is no following schedule to cacluate from! + return + + ordered_tis_by_start_date = [ti for ti in finished_tis if ti.start_date] + ordered_tis_by_start_date.sort(key=lambda ti: ti.start_date, reverse=False) + first_start_date = ordered_tis_by_start_date[0].start_date + if first_start_date: + # dag.following_schedule calculates the expected start datetime for a scheduled dagrun + # i.e. 
a daily flow for execution date 1/1/20 actually runs on 1/2/20 hh:mm:ss, + # and ti.start_date will be 1/2/20 hh:mm:ss so the following schedule is comparison + true_delay = first_start_date - dag.following_schedule(self.execution_date) + if true_delay.total_seconds() > 0: + Stats.timing('dagrun.{}.first_task_scheduling_delay'.format(dag.dag_id), true_delay) + except Exception as e: + self.log.warning('Failed to record first_task_scheduling_delay metric:\n', e) def _emit_duration_stats_for_finished_state(self): if self.state == State.RUNNING: @@ -363,9 +422,10 @@ def verify_integrity(self, session=None): tis = self.get_task_instances(session=session) # check for removed or restored tasks - task_ids = [] + task_ids = set() for ti in tis: - task_ids.append(ti.task_id) + task_instance_mutation_hook(ti) + task_ids.add(ti.task_id) task = None try: task = dag.get_task(ti.task_id) @@ -386,6 +446,7 @@ def verify_integrity(self, session=None): "removed from DAG '{}'".format(ti, dag)) Stats.incr("task_restored_to_dag.{}".format(dag.dag_id), 1, 1) ti.state = State.NONE + session.merge(ti) # check for missing tasks for task in six.itervalues(dag.task_dict): @@ -397,9 +458,19 @@ def verify_integrity(self, session=None): "task_instance_created-{}".format(task.__class__.__name__), 1, 1) ti = TaskInstance(task, self.execution_date) + task_instance_mutation_hook(ti) session.add(ti) - session.commit() + try: + session.commit() + except IntegrityError as err: + self.log.info(str(err)) + self.log.info( + 'Hit IntegrityError while creating the TIs for %s - %s', + dag.dag_id, self.execution_date + ) + self.log.info('Doing session rollback.') + session.rollback() @staticmethod def get_run(session, dag_id, execution_date): diff --git a/airflow/models/serialized_dag.py b/airflow/models/serialized_dag.py index 9709dba6563d3..f33e67b90bc85 100644 --- a/airflow/models/serialized_dag.py +++ b/airflow/models/serialized_dag.py @@ -17,13 +17,16 @@ # specific language governing permissions and limitations # under the License. -"""Serialzed DAG table in database.""" +"""Serialized DAG table in database.""" +import hashlib +import logging from datetime import timedelta from typing import Any, Optional import sqlalchemy_jsonfield from sqlalchemy import BigInteger, Column, Index, String, and_ +from sqlalchemy.orm import Session # noqa: F401 from sqlalchemy.sql import exists from airflow.models.base import ID_LEN, Base @@ -32,11 +35,9 @@ from airflow.serialization.serialized_objects import SerializedDAG from airflow.settings import json from airflow.utils import db, timezone -from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.sqlalchemy import UtcDateTime - -log = LoggingMixin().log +log = logging.getLogger(__name__) class SerializedDagModel(Base): @@ -53,7 +54,7 @@ class SerializedDagModel(Base): interval of deleting serialized DAGs in DB when the files are deleted, suggest to use a smaller interval such as 60 - It is used by webserver to load dagbags when ``store_serialized_dags=True``. + It is used by webserver to load dags when ``store_serialized_dags=True``. Because reading from database is lightweight compared to importing from files, it solves the webserver scalability issue. 
""" @@ -65,6 +66,7 @@ class SerializedDagModel(Base): fileloc_hash = Column(BigInteger, nullable=False) data = Column(sqlalchemy_jsonfield.JSONField(json=json), nullable=False) last_updated = Column(UtcDateTime, nullable=False) + dag_hash = Column(String(32), nullable=False) __table_args__ = ( Index('idx_fileloc_hash', fileloc_hash, unique=False), @@ -76,6 +78,10 @@ def __init__(self, dag): self.fileloc_hash = DagCode.dag_fileloc_hash(self.fileloc) self.data = SerializedDAG.to_dict(dag) self.last_updated = timezone.utcnow() + self.dag_hash = hashlib.md5(json.dumps(self.data, sort_keys=True).encode("utf-8")).hexdigest() + + def __repr__(self): + return "".format(self.dag_id) @classmethod @db.provide_session @@ -84,12 +90,13 @@ def write_dag(cls, min_update_interval=None, # type: Optional[int] session=None): """Serializes a DAG and writes it into database. + If the record already exists, it checks if the Serialized DAG changed or not. If it is + changed, it updates the record, ignores otherwise. :param dag: a DAG to be written into database :param min_update_interval: minimal interval in seconds to update serialized DAG :param session: ORM Session """ - log.debug("Writing DAG: %s to the DB", dag) # Checks if (Current Time - Time when the DAG was written to DB) < min_update_interval # If Yes, does nothing # If No or the DAG does not exists, updates / writes Serialized DAG to DB @@ -100,8 +107,17 @@ def write_dag(cls, ).scalar(): return - log.debug("Writing DAG: %s to the DB", dag.dag_id) - session.merge(cls(dag)) + log.debug("Checking if DAG (%s) changed", dag.dag_id) + new_serialized_dag = cls(dag) + serialized_dag_hash_from_db = session.query( + cls.dag_hash).filter(cls.dag_id == dag.dag_id).scalar() + + if serialized_dag_hash_from_db == new_serialized_dag.dag_hash: + log.debug("Serialized DAG (%s) is unchanged. Skipping writing to DB", dag.dag_id) + return + + log.debug("Writing Serialized DAG: %s to the DB", dag.dag_id) + session.merge(new_serialized_dag) log.debug("DAG: %s written to the DB", dag.dag_id) @classmethod @@ -110,6 +126,7 @@ def read_all_dags(cls, session=None): """Reads all DAGs in serialized_dag table. 
:param session: ORM Session + :type session: Session :returns: a dict of DAGs read from database """ serialized_dags = session.query(cls) @@ -146,6 +163,7 @@ def remove_dag(cls, dag_id, session=None): :param dag_id: dag_id to be deleted :type dag_id: str :param session: ORM Session + :type session: Session """ session.execute(cls.__table__.delete().where(cls.dag_id == dag_id)) @@ -157,6 +175,7 @@ def remove_deleted_dags(cls, alive_dag_filelocs, session=None): :param alive_dag_filelocs: file paths of alive DAGs :type alive_dag_filelocs: list :param session: ORM Session + :type session: Session """ alive_fileloc_hashes = [ DagCode.dag_fileloc_hash(fileloc) for fileloc in alive_dag_filelocs] @@ -177,6 +196,7 @@ def has_dag(cls, dag_id, session=None): :param dag_id: the DAG to check :type dag_id: str :param session: ORM Session + :type session: Session :rtype: bool """ return session.query(exists().where(cls.dag_id == dag_id)).scalar() @@ -191,6 +211,7 @@ def get(cls, dag_id, session=None): :param dag_id: the DAG to fetch :param session: ORM Session + :type session: Session """ from airflow.models.dag import DagModel row = session.query(cls).filter(cls.dag_id == dag_id).one_or_none() @@ -203,3 +224,17 @@ def get(cls, dag_id, session=None): DagModel.root_dag_id).filter(DagModel.dag_id == dag_id).scalar() return session.query(cls).filter(cls.dag_id == root_dag_id).one_or_none() + + @classmethod + @db.provide_session + def get_last_updated_datetime(cls, dag_id, session): + """ + Get the date when the Serialized DAG associated to DAG was last updated + in serialized_dag table + + :param dag_id: DAG ID + :type dag_id: str + :param session: ORM Session + :type session: Session + """ + return session.query(cls.last_updated).filter(cls.dag_id == dag_id).scalar() diff --git a/airflow/models/skipmixin.py b/airflow/models/skipmixin.py index 57341d8fb4eab..a65d4848aaf4d 100644 --- a/airflow/models/skipmixin.py +++ b/airflow/models/skipmixin.py @@ -19,28 +19,27 @@ from airflow.models.taskinstance import TaskInstance from airflow.utils import timezone -from airflow.utils.db import provide_session +from airflow.utils.db import create_session, provide_session from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.state import State import six -from typing import Union, Iterable, Set + +# The key used by SkipMixin to store XCom data. +XCOM_SKIPMIXIN_KEY = "skipmixin_key" + +# The dictionary key used to denote task IDs that are skipped +XCOM_SKIPMIXIN_SKIPPED = "skipped" + +# The dictionary key used to denote task IDs that are followed +XCOM_SKIPMIXIN_FOLLOWED = "followed" class SkipMixin(LoggingMixin): - @provide_session - def skip(self, dag_run, execution_date, tasks, session=None): + def _set_state_to_skipped(self, dag_run, execution_date, tasks, session): """ - Sets tasks instances to skipped from the same dag run. - - :param dag_run: the DagRun for which to set the tasks to skipped - :param execution_date: execution_date - :param tasks: tasks to skip (not task_ids) - :param session: db session to use + Used internally to set state of task instances to skipped from the same dag run. 
""" - if not tasks: - return - task_ids = [d.task_id for d in tasks] now = timezone.utcnow() @@ -48,12 +47,15 @@ def skip(self, dag_run, execution_date, tasks, session=None): session.query(TaskInstance).filter( TaskInstance.dag_id == dag_run.dag_id, TaskInstance.execution_date == dag_run.execution_date, - TaskInstance.task_id.in_(task_ids) - ).update({TaskInstance.state: State.SKIPPED, - TaskInstance.start_date: now, - TaskInstance.end_date: now}, - synchronize_session=False) - session.commit() + TaskInstance.task_id.in_(task_ids), + ).update( + { + TaskInstance.state: State.SKIPPED, + TaskInstance.start_date: now, + TaskInstance.end_date: now, + }, + synchronize_session=False, + ) else: assert execution_date is not None, "Execution date is None and no dag run" @@ -66,18 +68,61 @@ def skip(self, dag_run, execution_date, tasks, session=None): ti.end_date = now session.merge(ti) - session.commit() + @provide_session + def skip( + self, dag_run, execution_date, tasks, session=None, + ): + """ + Sets tasks instances to skipped from the same dag run. - def skip_all_except(self, ti, branch_task_ids): - # type: (TaskInstance, Union[str, Iterable[str]]) -> None + If this instance has a `task_id` attribute, store the list of skipped task IDs to XCom + so that NotPreviouslySkippedDep knows these tasks should be skipped when they + are cleared. + + :param dag_run: the DagRun for which to set the tasks to skipped + :param execution_date: execution_date + :param tasks: tasks to skip (not task_ids) + :param session: db session to use + """ + if not tasks: + return + + self._set_state_to_skipped(dag_run, execution_date, tasks, session) + session.commit() + + # SkipMixin may not necessarily have a task_id attribute. Only store to XCom if one is available. + try: + task_id = self.task_id + except AttributeError: + task_id = None + + if task_id is not None: + from airflow.models.xcom import XCom + + XCom.set( + key=XCOM_SKIPMIXIN_KEY, + value={XCOM_SKIPMIXIN_SKIPPED: [d.task_id for d in tasks]}, + task_id=task_id, + dag_id=dag_run.dag_id, + execution_date=dag_run.execution_date, + session=session + ) + + def skip_all_except( + self, ti, branch_task_ids + ): """ This method implements the logic for a branching operator; given a single task ID or list of task IDs to follow, this skips all other tasks immediately downstream of this operator. + + branch_task_ids is stored to XCom so that NotPreviouslySkippedDep knows skipped tasks or + newly added tasks should be skipped when they are cleared. """ self.log.info("Following branch %s", branch_task_ids) if isinstance(branch_task_ids, six.string_types): - branch_task_ids = [branch_task_ids] + branch_task_ids = {branch_task_ids} + branch_task_ids = set(branch_task_ids) dag_run = ti.get_dagrun() task = ti.task @@ -86,17 +131,38 @@ def skip_all_except(self, ti, branch_task_ids): downstream_tasks = task.downstream_list if downstream_tasks: - # Also check downstream tasks of the branch task. In case the task to skip - # is also a downstream task of the branch task, we exclude it from skipping. - branch_downstream_task_ids = set() # type: Set[str] - for b in branch_task_ids: - branch_downstream_task_ids.update(dag. - get_task(b). 
- get_flat_relative_ids(upstream=False)) - - skip_tasks = [t for t in downstream_tasks - if t.task_id not in branch_task_ids and - t.task_id not in branch_downstream_task_ids] + # For a branching workflow that looks like this, when "branch" does skip_all_except("task1"), + # we intuitively expect both "task1" and "join" to execute even though strictly speaking, + # "join" is also immediately downstream of "branch" and should have been skipped. Therefore, + # we need a special case here for such empty branches: Check downstream tasks of branch_task_ids. + # In case the task to skip is also downstream of branch_task_ids, we add it to branch_task_ids and + # exclude it from skipping. + # + # branch -----> join + # \ ^ + # v / + # task1 + # + for branch_task_id in list(branch_task_ids): + branch_task_ids.update( + dag.get_task(branch_task_id).get_flat_relative_ids(upstream=False) + ) + + skip_tasks = [ + t + for t in downstream_tasks + if t.task_id not in branch_task_ids + ] + follow_task_ids = [t.task_id for t in downstream_tasks if t.task_id in branch_task_ids] self.log.info("Skipping tasks %s", [t.task_id for t in skip_tasks]) - self.skip(dag_run, ti.execution_date, skip_tasks) + with create_session() as session: + self._set_state_to_skipped( + dag_run, ti.execution_date, skip_tasks, session=session + ) + # For some reason, session.commit() needs to happen before xcom_push. + # Otherwise the session is not committed. + session.commit() + ti.xcom_push( + key=XCOM_SKIPMIXIN_KEY, value={XCOM_SKIPMIXIN_FOLLOWED: follow_task_ids} + ) diff --git a/airflow/models/taskinstance.py b/airflow/models/taskinstance.py index 242cfe38bf552..a9459dad66d33 100644 --- a/airflow/models/taskinstance.py +++ b/airflow/models/taskinstance.py @@ -43,7 +43,8 @@ from airflow import settings from airflow.configuration import conf from airflow.exceptions import ( - AirflowException, AirflowTaskTimeout, AirflowSkipException, AirflowRescheduleException + AirflowException, AirflowFailException, AirflowRescheduleException, AirflowSkipException, + AirflowTaskTimeout, ) from airflow.models.base import Base, ID_LEN from airflow.models.log import Log @@ -511,7 +512,7 @@ def refresh_from_task(self, task, pool_override=None): self.run_as_user = task.run_as_user self.max_tries = task.retries self.executor_config = task.executor_config - self.operator = task.__class__.__name__ + self.operator = task.task_type @provide_session def clear_xcom_data(self, session=None): @@ -963,8 +964,8 @@ def signal_handler(signum, frame): self.render_templates(context=context) if STORE_SERIALIZED_DAGS: - RTIF.write(RTIF(ti=self, render_templates=False), session=session) - RTIF.delete_old_records(self.task_id, self.dag_id, session=session) + RTIF.write(RTIF(ti=self, render_templates=False)) + RTIF.delete_old_records(self.task_id, self.dag_id) task_copy.pre_execute(context=context) @@ -989,7 +990,7 @@ def signal_handler(signum, frame): task_copy.post_execute(context=context, result=result) end_time = time.time() - duration = end_time - start_time + duration = timedelta(seconds=end_time - start_time) Stats.timing( 'dag.{dag_id}.{task_id}.duration'.format( dag_id=task_copy.dag_id, @@ -1026,6 +1027,10 @@ def signal_handler(signum, frame): self.refresh_from_db() self._handle_reschedule(actual_start_date, reschedule_exception, test_mode, context) return + except AirflowFailException as e: + self.refresh_from_db() + self.handle_failure(e, test_mode, context, force_fail=True) + raise except AirflowException as e: self.refresh_from_db() # for case when task 
is marked as success/failed externally @@ -1136,7 +1141,7 @@ def _handle_reschedule(self, actual_start_date, reschedule_exception, test_mode= self.log.info('Rescheduling task, marking task as UP_FOR_RESCHEDULE') @provide_session - def handle_failure(self, error, test_mode=None, context=None, session=None): + def handle_failure(self, error, test_mode=None, context=None, force_fail=False, session=None): if test_mode is None: test_mode = self.test_mode if context is None: @@ -1157,64 +1162,51 @@ def handle_failure(self, error, test_mode=None, context=None, session=None): if context is not None: context['exception'] = error - # Let's go deeper - try: - # Since this function is called only when the TI state is running, - # try_number contains the current try_number (not the next). We - # only mark task instance as FAILED if the next task instance - # try_number exceeds the max_tries. - if self.is_eligible_to_retry(): - self.state = State.UP_FOR_RETRY - self.log.info('Marking task as UP_FOR_RETRY') - if task.email_on_retry and task.email: - self.email_alert(error) + # Set state correctly and figure out how to log it, + # what callback to call if any, and how to decide whether to email + + # Since this function is called only when the TaskInstance state is running, + # try_number contains the current try_number (not the next). We + # only mark task instance as FAILED if the next task instance + # try_number exceeds the max_tries ... or if force_fail is truthy + + if force_fail or not self.is_eligible_to_retry(): + self.state = State.FAILED + if force_fail: + log_message = "Immediate failure requested. Marking task as FAILED." else: - self.state = State.FAILED - if task.retries: - self.log.info( - 'All retries failed; marking task as FAILED.' - 'dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s', - self.dag_id, - self.task_id, - self.execution_date.strftime('%Y%m%dT%H%M%S') if hasattr( - self, - 'execution_date') and self.execution_date else '', - self.start_date.strftime('%Y%m%dT%H%M%S') if hasattr( - self, - 'start_date') and self.start_date else '', - self.end_date.strftime('%Y%m%dT%H%M%S') if hasattr( - self, - 'end_date') and self.end_date else '') - else: - self.log.info( - 'Marking task as FAILED.' - 'dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s', - self.dag_id, - self.task_id, - self.execution_date.strftime('%Y%m%dT%H%M%S') if hasattr( - self, - 'execution_date') and self.execution_date else '', - self.start_date.strftime('%Y%m%dT%H%M%S') if hasattr( - self, - 'start_date') and self.start_date else '', - self.end_date.strftime('%Y%m%dT%H%M%S') if hasattr( - self, - 'end_date') and self.end_date else '') - if task.email_on_failure and task.email: - self.email_alert(error) - except Exception as e2: - self.log.error('Failed to send email to: %s', task.email) - self.log.exception(e2) + log_message = "Marking task as FAILED." + email_for_state = task.email_on_failure + callback = task.on_failure_callback + else: + self.state = State.UP_FOR_RETRY + log_message = "Marking task as UP_FOR_RETRY." 
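The handle_failure rewrite above collapses the duplicated logging, email and callback branches into a single decision: force_fail (the new AirflowFailException path) or exhausted retries means FAILED with the failure email and callback, otherwise UP_FOR_RETRY with the retry ones. A plain-Python sketch of that decision table, with scalar arguments standing in for the TaskInstance and its task:

def decide(force_fail, retries, try_number, max_tries):
    # Mirrors is_eligible_to_retry(): retries configured and tries not yet exhausted.
    eligible_to_retry = bool(retries) and try_number <= max_tries
    if force_fail or not eligible_to_retry:
        return "FAILED", "email_on_failure", "on_failure_callback"
    return "UP_FOR_RETRY", "email_on_retry", "on_retry_callback"


print(decide(force_fail=False, retries=3, try_number=1, max_tries=3))
# ('UP_FOR_RETRY', 'email_on_retry', 'on_retry_callback')
print(decide(force_fail=True, retries=3, try_number=1, max_tries=3))
# ('FAILED', 'email_on_failure', 'on_failure_callback')
print(decide(force_fail=False, retries=3, try_number=4, max_tries=3))
# ('FAILED', 'email_on_failure', 'on_failure_callback')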
+ email_for_state = task.email_on_retry + callback = task.on_retry_callback + + self.log.info( + '%s dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s', + log_message, + self.dag_id, + self.task_id, + self._safe_date('execution_date', '%Y%m%dT%H%M%S'), + self._safe_date('start_date', '%Y%m%dT%H%M%S'), + self._safe_date('end_date', '%Y%m%dT%H%M%S') + ) + if email_for_state and task.email: + try: + self.email_alert(error) + except Exception as e2: + self.log.error('Failed to send email to: %s', task.email) + self.log.exception(e2) # Handling callbacks pessimistically - try: - if self.state == State.UP_FOR_RETRY and task.on_retry_callback: - task.on_retry_callback(context) - if self.state == State.FAILED and task.on_failure_callback: - task.on_failure_callback(context) - except Exception as e3: - self.log.error("Failed at executing callback") - self.log.exception(e3) + if callback: + try: + callback(context) + except Exception as e3: + self.log.error("Failed at executing callback") + self.log.exception(e3) if not test_mode: session.merge(self) @@ -1224,6 +1216,12 @@ def is_eligible_to_retry(self): """Is task instance is eligible for retry""" return self.task.retries and self.try_number <= self.max_tries + def _safe_date(self, date_attr, fmt): + result = getattr(self, date_attr, None) + if result is not None: + return result.strftime(fmt) + return '' + @provide_session def get_template_context(self, session=None): task = self.task diff --git a/airflow/models/variable.py b/airflow/models/variable.py index 9cc237112338a..0eff002b918c6 100644 --- a/airflow/models/variable.py +++ b/airflow/models/variable.py @@ -46,18 +46,17 @@ def __repr__(self): return '{} : {}'.format(self.key, self._val) def get_val(self): - log = LoggingMixin().log if self._val is not None and self.is_encrypted: try: fernet = get_fernet() return fernet.decrypt(bytes(self._val, 'utf-8')).decode() except InvalidFernetToken: - log.error("Can't decrypt _val for key={}, invalid token " - "or value".format(self.key)) + self.log.error("Can't decrypt _val for key={}, invalid token " + "or value".format(self.key)) return None except Exception: - log.error("Can't decrypt _val for key={}, FERNET_KEY " - "configuration missing".format(self.key)) + self.log.error("Can't decrypt _val for key={}, FERNET_KEY " + "configuration missing".format(self.key)) return None else: return self._val diff --git a/airflow/models/xcom.py b/airflow/models/xcom.py index 434cb7151f5dc..e33c5e564c469 100644 --- a/airflow/models/xcom.py +++ b/airflow/models/xcom.py @@ -18,6 +18,7 @@ # under the License. import json +import logging import pickle from sqlalchemy import Column, Integer, String, Index, LargeBinary, and_ @@ -31,6 +32,7 @@ from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.sqlalchemy import UtcDateTime +log = logging.getLogger(__name__) # MAX XCOM Size is 48KB # https://github.com/apache/airflow/pull/1618#discussion_r68249677 @@ -38,7 +40,7 @@ XCOM_RETURN_KEY = 'return_value' -class XCom(Base, LoggingMixin): +class BaseXCom(Base, LoggingMixin): """ Base class for XCom objects. """ @@ -65,17 +67,13 @@ class XCom(Base, LoggingMixin): """ @reconstructor def init_on_load(self): - enable_pickling = conf.getboolean('core', 'enable_xcom_pickling') - if enable_pickling: + try: + self.value = self.deserialize_value(self) + except (UnicodeEncodeError, ValueError): + # For backward-compatibility. + # Preventing errors in webserver + # due to XComs mixed with pickled and unpickled. 
self.value = pickle.loads(self.value) - else: - try: - self.value = json.loads(self.value.decode('UTF-8')) - except (UnicodeEncodeError, ValueError): - # For backward-compatibility. - # Preventing errors in webserver - # due to XComs mixed with pickled and unpickled. - self.value = pickle.loads(self.value) def __repr__(self): return ''.format( @@ -162,7 +160,6 @@ def get_one(cls, try: return json.loads(result.value.decode('UTF-8')) except ValueError: - log = LoggingMixin().log log.error("Could not deserialize the XCOM value from JSON. " "If you are using pickles instead of JSON " "for XCOM, then you need to enable pickle " @@ -226,9 +223,42 @@ def serialize_value(value): try: return json.dumps(value).encode('UTF-8') except ValueError: - log = LoggingMixin().log log.error("Could not serialize the XCOM value into JSON. " "If you are using pickles instead of JSON " "for XCOM, then you need to enable pickle " "support for XCOM in your airflow config.") raise + + @staticmethod + def deserialize_value(result): + # TODO: "pickling" has been deprecated and JSON is preferred. + # "pickling" will be removed in Airflow 2.0. + enable_pickling = conf.getboolean('core', 'enable_xcom_pickling') + if enable_pickling: + return pickle.loads(result.value) + + try: + return json.loads(result.value.decode('UTF-8')) + except ValueError: + log.error("Could not deserialize the XCOM value from JSON. " + "If you are using pickles instead of JSON " + "for XCOM, then you need to enable pickle " + "support for XCOM in your airflow config.") + raise + + +def resolve_xcom_backend(): + """Resolves custom XCom class""" + clazz = conf.getimport("core", "xcom_backend", fallback="airflow.models.xcom.{}" + .format(BaseXCom.__name__)) + if clazz: + if not issubclass(clazz, BaseXCom): + raise TypeError( + "Your custom XCom class `{class_name}` is not a subclass of `{base_name}`." + .format(class_name=clazz.__name__, base_name=BaseXCom.__name__) + ) + return clazz + return BaseXCom + + +XCom = resolve_xcom_backend() diff --git a/airflow/operators/__init__.py b/airflow/operators/__init__.py index fb5383f0a2f58..a901ab9fd7478 100644 --- a/airflow/operators/__init__.py +++ b/airflow/operators/__init__.py @@ -21,6 +21,9 @@ import os from airflow.models import BaseOperator # noqa: F401 + +PY37 = sys.version_info >= (3, 7) + # ------------------------------------------------------------------------ # # #TODO #FIXME Airflow 2.0 @@ -101,9 +104,14 @@ def _integrate_plugins(): """Integrate plugins to the context""" - from airflow.plugins_manager import operators_modules, register_inbuilt_operator_links + from airflow.plugins_manager import operators_modules for operators_module in operators_modules: + sys.modules[operators_module.__name__] = operators_module + if not PY37: + from pep562 import Pep562 + operators_module = Pep562(operators_module.__name__) + globals()[operators_module._name] = operators_module ########################################################## @@ -121,5 +129,3 @@ def _integrate_plugins(): "import from 'airflow.operators.[plugin_module]' " "instead. Support for direct imports will be dropped " "entirely in Airflow 2.0.".format(i=operator_name)) - - register_inbuilt_operator_links() diff --git a/airflow/operators/bash_operator.py b/airflow/operators/bash_operator.py index ec1058c7fb55b..e27e74656de37 100644 --- a/airflow/operators/bash_operator.py +++ b/airflow/operators/bash_operator.py @@ -33,7 +33,7 @@ class BashOperator(BaseOperator): - """ + r""" Execute a Bash script, command or set of commands. .. 
seealso:: @@ -53,6 +53,37 @@ class BashOperator(BaseOperator): :type env: dict :param output_encoding: Output encoding of bash command :type output_encoding: str + + .. warning:: + + Care should be taken with "user" input or when using Jinja templates in the + ``bash_command``, as this bash operator does not perform any escaping or + sanitization of the command. + + This applies mostly to using "dag_run" conf, as that can be submitted via + users in the Web UI. Most of the default template variables are not at + risk. + + For example, do **not** do this: + + .. code-block:: python + + bash_task = BashOperator( + task_id="bash_task", + bash_command='echo "Here is the message: \'{{ dag_run.conf["message"] if dag_run else "" }}\'"', + ) + + Instead, you should pass this via the ``env`` kwarg and use double-quotes + inside the bash_command, as below: + + .. code-block:: python + + bash_task = BashOperator( + task_id="bash_task", + bash_command='echo "here is the message: \'$message\'"', + env={'message': '{{ dag_run.conf["message"] if dag_run else "" }}'}, + ) + """ template_fields = ('bash_command', 'env') template_ext = ('.sh', '.bash',) diff --git a/airflow/operators/check_operator.py b/airflow/operators/check_operator.py index b6d3a1872af0c..12ac472d34711 100644 --- a/airflow/operators/check_operator.py +++ b/airflow/operators/check_operator.py @@ -17,409 +17,70 @@ # specific language governing permissions and limitations # under the License. -from builtins import str, zip -from typing import Optional, Any, Iterable, Dict, SupportsAbs +"""This module is deprecated. Please use `airflow.operators.sql`.""" -from airflow.exceptions import AirflowException -from airflow.hooks.base_hook import BaseHook -from airflow.models import BaseOperator -from airflow.utils.decorators import apply_defaults +import warnings +from airflow.operators.sql import ( + SQLCheckOperator, SQLIntervalCheckOperator, SQLThresholdCheckOperator, SQLValueCheckOperator, +) -class CheckOperator(BaseOperator): - """ - Performs checks against a db. The ``CheckOperator`` expects - a sql query that will return a single row. Each value on that - first row is evaluated using python ``bool`` casting. If any of the - values return ``False`` the check is failed and errors out. - - Note that Python bool casting evals the following as ``False``: - - * ``False`` - * ``0`` - * Empty string (``""``) - * Empty list (``[]``) - * Empty dictionary or set (``{}``) - - Given a query like ``SELECT COUNT(*) FROM foo``, it will fail only if - the count ``== 0``. You can craft much more complex query that could, - for instance, check that the table has the same number of rows as - the source table upstream, or that the count of today's partition is - greater than yesterday's partition, or that a set of metrics are less - than 3 standard deviation for the 7 day average. - - This operator can be used as a data quality check in your pipeline, and - depending on where you put it in your DAG, you have the choice to - stop the critical path, preventing from - publishing dubious data, or on the side and receive email alerts - without stopping the progress of the DAG. - Note that this is an abstract class and get_db_hook - needs to be defined. Whereas a get_db_hook is hook that gets a - single record from an external source. - - :param sql: the sql to be executed. 
(templated) - :type sql: str +class CheckOperator(SQLCheckOperator): """ - - template_fields = ('sql',) # type: Iterable[str] - template_ext = ('.hql', '.sql',) # type: Iterable[str] - ui_color = '#fff7e6' - - @apply_defaults - def __init__( - self, - sql, # type: str - conn_id=None, # type: Optional[str] - *args, - **kwargs - ): - super(CheckOperator, self).__init__(*args, **kwargs) - self.conn_id = conn_id - self.sql = sql - - def execute(self, context=None): - self.log.info('Executing SQL check: %s', self.sql) - records = self.get_db_hook().get_first(self.sql) - - self.log.info('Record: %s', records) - if not records: - raise AirflowException("The query returned None") - elif not all([bool(r) for r in records]): - raise AirflowException("Test failed.\nQuery:\n{query}\nResults:\n{records!s}".format( - query=self.sql, records=records)) - - self.log.info("Success.") - - def get_db_hook(self): - return BaseHook.get_hook(conn_id=self.conn_id) - - -def _convert_to_float_if_possible(s): + This class is deprecated. + Please use `airflow.operators.sql.SQLCheckOperator`. """ - A small helper function to convert a string to a numeric value - if appropriate - :param s: the string to be converted - :type s: str - """ - try: - ret = float(s) - except (ValueError, TypeError): - ret = s - return ret + def __init__(self, *args, **kwargs): + warnings.warn( + """This class is deprecated. + Please use `airflow.operators.sql.SQLCheckOperator`.""", + DeprecationWarning, stacklevel=2 + ) + super(CheckOperator, self).__init__(*args, **kwargs) -class ValueCheckOperator(BaseOperator): +class IntervalCheckOperator(SQLIntervalCheckOperator): """ - Performs a simple value check using sql code. - - Note that this is an abstract class and get_db_hook - needs to be defined. Whereas a get_db_hook is hook that gets a - single record from an external source. - - :param sql: the sql to be executed. (templated) - :type sql: str + This class is deprecated. + Please use `airflow.operators.sql.SQLIntervalCheckOperator`. """ - __mapper_args__ = { - 'polymorphic_identity': 'ValueCheckOperator' - } - template_fields = ('sql', 'pass_value',) # type: Iterable[str] - template_ext = ('.hql', '.sql',) # type: Iterable[str] - ui_color = '#fff7e6' - - @apply_defaults - def __init__( - self, - sql, # type: str - pass_value, # type: Any - tolerance=None, # type: Any - conn_id=None, # type: Optional[str] - *args, - **kwargs - ): - super(ValueCheckOperator, self).__init__(*args, **kwargs) - self.sql = sql - self.conn_id = conn_id - self.pass_value = str(pass_value) - tol = _convert_to_float_if_possible(tolerance) - self.tol = tol if isinstance(tol, float) else None - self.has_tolerance = self.tol is not None - - def execute(self, context=None): - self.log.info('Executing SQL check: %s', self.sql) - records = self.get_db_hook().get_first(self.sql) - - if not records: - raise AirflowException("The query returned None") - - pass_value_conv = _convert_to_float_if_possible(self.pass_value) - is_numeric_value_check = isinstance(pass_value_conv, float) - - tolerance_pct_str = str(self.tol * 100) + '%' if self.has_tolerance else None - error_msg = ("Test failed.\nPass value:{pass_value_conv}\n" - "Tolerance:{tolerance_pct_str}\n" - "Query:\n{sql}\nResults:\n{records!s}").format( - pass_value_conv=pass_value_conv, - tolerance_pct_str=tolerance_pct_str, - sql=self.sql, - records=records + def __init__(self, *args, **kwargs): + warnings.warn( + """This class is deprecated. 
+ Please use `airflow.operators.sql.SQLIntervalCheckOperator`.""", + DeprecationWarning, stacklevel=2 ) - - if not is_numeric_value_check: - tests = self._get_string_matches(records, pass_value_conv) - elif is_numeric_value_check: - try: - numeric_records = self._to_float(records) - except (ValueError, TypeError): - raise AirflowException("Converting a result to float failed.\n{}".format(error_msg)) - tests = self._get_numeric_matches(numeric_records, pass_value_conv) - else: - tests = [] - - if not all(tests): - raise AirflowException(error_msg) - - def _to_float(self, records): - return [float(record) for record in records] - - def _get_string_matches(self, records, pass_value_conv): - return [str(record) == pass_value_conv for record in records] - - def _get_numeric_matches(self, numeric_records, numeric_pass_value_conv): - if self.has_tolerance: - return [ - numeric_pass_value_conv * (1 - self.tol) <= record <= numeric_pass_value_conv * (1 + self.tol) - for record in numeric_records - ] - - return [record == numeric_pass_value_conv for record in numeric_records] - - def get_db_hook(self): - return BaseHook.get_hook(conn_id=self.conn_id) + super(IntervalCheckOperator, self).__init__(*args, **kwargs) -class IntervalCheckOperator(BaseOperator): +class ThresholdCheckOperator(SQLThresholdCheckOperator): """ - Checks that the values of metrics given as SQL expressions are within - a certain tolerance of the ones from days_back before. - - Note that this is an abstract class and get_db_hook - needs to be defined. Whereas a get_db_hook is hook that gets a - single record from an external source. - - :param table: the table name - :type table: str - :param days_back: number of days between ds and the ds we want to check - against. Defaults to 7 days - :type days_back: int - :param ratio_formula: which formula to use to compute the ratio between - the two metrics. Assuming cur is the metric of today and ref is - the metric to today - days_back. - - max_over_min: computes max(cur, ref) / min(cur, ref) - relative_diff: computes abs(cur-ref) / ref - - Default: 'max_over_min' - :type ratio_formula: str - :param ignore_zero: whether we should ignore zero metrics - :type ignore_zero: bool - :param metrics_threshold: a dictionary of ratios indexed by metrics - :type metrics_threshold: dict + This class is deprecated. + Please use `airflow.operators.sql.SQLThresholdCheckOperator`. """ - __mapper_args__ = { - 'polymorphic_identity': 'IntervalCheckOperator' - } - template_fields = ('sql1', 'sql2') # type: Iterable[str] - template_ext = ('.hql', '.sql',) # type: Iterable[str] - ui_color = '#fff7e6' - - ratio_formulas = { - 'max_over_min': lambda cur, ref: float(max(cur, ref)) / min(cur, ref), - 'relative_diff': lambda cur, ref: float(abs(cur - ref)) / ref, - } - - @apply_defaults - def __init__( - self, - table, # type: str - metrics_thresholds, # type: Dict[str, int] - date_filter_column='ds', # type: Optional[str] - days_back=-7, # type: SupportsAbs[int] - ratio_formula='max_over_min', # type: Optional[str] - ignore_zero=True, # type: Optional[bool] - conn_id=None, # type: Optional[str] - *args, **kwargs - ): - super(IntervalCheckOperator, self).__init__(*args, **kwargs) - if ratio_formula not in self.ratio_formulas: - msg_template = "Invalid diff_method: {diff_method}. 
" \ - "Supported diff methods are: {diff_methods}" - - raise AirflowException( - msg_template.format(diff_method=ratio_formula, - diff_methods=self.ratio_formulas) - ) - self.ratio_formula = ratio_formula - self.ignore_zero = ignore_zero - self.table = table - self.metrics_thresholds = metrics_thresholds - self.metrics_sorted = sorted(metrics_thresholds.keys()) - self.date_filter_column = date_filter_column - self.days_back = -abs(days_back) - self.conn_id = conn_id - sqlexp = ', '.join(self.metrics_sorted) - sqlt = "SELECT {sqlexp} FROM {table} WHERE {date_filter_column}=".format( - sqlexp=sqlexp, table=table, date_filter_column=date_filter_column + def __init__(self, *args, **kwargs): + warnings.warn( + """This class is deprecated. + Please use `airflow.operators.sql.SQLThresholdCheckOperator`.""", + DeprecationWarning, stacklevel=2 ) - - self.sql1 = sqlt + "'{{ ds }}'" - self.sql2 = sqlt + "'{{ macros.ds_add(ds, " + str(self.days_back) + ") }}'" - - def execute(self, context=None): - hook = self.get_db_hook() - self.log.info('Using ratio formula: %s', self.ratio_formula) - self.log.info('Executing SQL check: %s', self.sql2) - row2 = hook.get_first(self.sql2) - self.log.info('Executing SQL check: %s', self.sql1) - row1 = hook.get_first(self.sql1) - - if not row2: - raise AirflowException("The query {} returned None".format(self.sql2)) - if not row1: - raise AirflowException("The query {} returned None".format(self.sql1)) - - current = dict(zip(self.metrics_sorted, row1)) - reference = dict(zip(self.metrics_sorted, row2)) - - ratios = {} - test_results = {} - - for m in self.metrics_sorted: - cur = current[m] - ref = reference[m] - threshold = self.metrics_thresholds[m] - if cur == 0 or ref == 0: - ratios[m] = None - test_results[m] = self.ignore_zero - else: - ratios[m] = self.ratio_formulas[self.ratio_formula](current[m], reference[m]) - test_results[m] = ratios[m] < threshold - - self.log.info( - ( - "Current metric for %s: %s\n" - "Past metric for %s: %s\n" - "Ratio for %s: %s\n" - "Threshold: %s\n" - ), m, cur, m, ref, m, ratios[m], threshold) - - if not all(test_results.values()): - failed_tests = [it[0] for it in test_results.items() if not it[1]] - j = len(failed_tests) - n = len(self.metrics_sorted) - self.log.warning("The following %s tests out of %s failed:", j, n) - for k in failed_tests: - self.log.warning( - "'%s' check failed. %s is above %s", k, ratios[k], self.metrics_thresholds[k] - ) - raise AirflowException("The following tests have failed:\n {0}".format(", ".join( - sorted(failed_tests)))) - - self.log.info("All tests have passed") - - def get_db_hook(self): - return BaseHook.get_hook(conn_id=self.conn_id) + super(ThresholdCheckOperator, self).__init__(*args, **kwargs) -class ThresholdCheckOperator(BaseOperator): +class ValueCheckOperator(SQLValueCheckOperator): """ - Performs a value check using sql code against a mininmum threshold - and a maximum threshold. Thresholds can be in the form of a numeric - value OR a sql statement that results a numeric. - - Note that this is an abstract class and get_db_hook - needs to be defined. Whereas a get_db_hook is hook that gets a - single record from an external source. - - :param sql: the sql to be executed. (templated) - :type sql: str - :param min_threshold: numerical value or min threshold sql to be executed (templated) - :type min_threshold: numeric or str - :param max_threshold: numerical value or max threshold sql to be executed (templated) - :type max_threshold: numeric or str + This class is deprecated. 
+ Please use `airflow.operators.sql.SQLValueCheckOperator`. """ - template_fields = ('sql', 'min_threshold', 'max_threshold') # type: Iterable[str] - template_ext = ('.hql', '.sql',) # type: Iterable[str] - - @apply_defaults - def __init__( - self, - sql, # type: str - min_threshold, # type: Any - max_threshold, # type: Any - conn_id=None, # type: Optional[str] - *args, **kwargs - ): - super(ThresholdCheckOperator, self).__init__(*args, **kwargs) - self.sql = sql - self.conn_id = conn_id - self.min_threshold = _convert_to_float_if_possible(min_threshold) - self.max_threshold = _convert_to_float_if_possible(max_threshold) - - def execute(self, context=None): - hook = self.get_db_hook() - result = hook.get_first(self.sql)[0][0] - - if isinstance(self.min_threshold, float): - lower_bound = self.min_threshold - else: - lower_bound = hook.get_first(self.min_threshold)[0][0] - - if isinstance(self.max_threshold, float): - upper_bound = self.max_threshold - else: - upper_bound = hook.get_first(self.max_threshold)[0][0] - - meta_data = { - "result": result, - "task_id": self.task_id, - "min_threshold": lower_bound, - "max_threshold": upper_bound, - "within_threshold": lower_bound <= result <= upper_bound - } - - self.push(meta_data) - if not meta_data["within_threshold"]: - error_msg = ( - 'Threshold Check: "{task_id}" failed.\n' - 'DAG: {dag_id}\nTask_id: {task_id}\n' - 'Check description: {description}\n' - 'SQL: {sql}\n' - 'Result: {result} is not within thresholds ' - '{min_threshold} and {max_threshold}' - ).format( - task_id=self.task_id, dag_id=self.dag_id, - description=meta_data.get("description"), sql=self.sql, - result=round(meta_data.get("result"), 2), - min_threshold=meta_data.get("min_threshold"), - max_threshold=meta_data.get("max_threshold") - ) - raise AirflowException(error_msg) - - self.log.info("Test %s Successful.", self.task_id) - - def push(self, meta_data): - """ - Optional: Send data check info and metadata to an external database. - Default functionality will log metadata. - """ - - info = "\n".join(["""{}: {}""".format(key, item) for key, item in meta_data.items()]) - self.log.info("Log from %s:\n%s", self.dag_id, info) - - def get_db_hook(self): - return BaseHook.get_hook(conn_id=self.conn_id) + def __init__(self, *args, **kwargs): + warnings.warn( + """This class is deprecated. 
+ Please use `airflow.operators.sql.SQLValueCheckOperator`.""", + DeprecationWarning, stacklevel=2 + ) + super(ValueCheckOperator, self).__init__(*args, **kwargs) diff --git a/airflow/operators/docker_operator.py b/airflow/operators/docker_operator.py index cb335d3dd9fea..d0a872a2a8491 100644 --- a/airflow/operators/docker_operator.py +++ b/airflow/operators/docker_operator.py @@ -19,9 +19,10 @@ """ Implements Docker operator """ -import json import ast + +import six from docker import APIClient, tls from airflow.hooks.docker_hook import DockerHook @@ -265,9 +266,10 @@ def execute(self, context): # Pull the docker image if `force_pull` is set or image does not exist locally if self.force_pull or len(self.cli.images(name=self.image)) == 0: self.log.info('Pulling docker image %s', self.image) - for l in self.cli.pull(self.image, stream=True, decode=True): - output = json.loads(l.decode('utf-8').strip()) - if 'status' in output: + for output in self.cli.pull(self.image, stream=True, decode=True): + if isinstance(output, six.string_types): + self.log.info("%s", output) + if isinstance(output, dict) and 'status' in output: self.log.info("%s", output['status']) self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir diff --git a/airflow/operators/hive_operator.py b/airflow/operators/hive_operator.py index 9cebc386c3192..0d8bca75a52e2 100644 --- a/airflow/operators/hive_operator.py +++ b/airflow/operators/hive_operator.py @@ -19,8 +19,11 @@ from __future__ import unicode_literals +import os import re +from airflow.utils import operator_helpers + from airflow.hooks.hive_hooks import HiveCliHook from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults @@ -134,9 +137,21 @@ def execute(self, context): self.hook.run_cli(hql=self.hql, schema=self.schema, hive_conf=self.hiveconfs) def dry_run(self): + # Reset airflow environment variables to prevent + # existing env vars from impacting behavior. + self.clear_airflow_vars() + self.hook = self.get_hook() self.hook.test_hql(hql=self.hql) def on_kill(self): if self.hook: self.hook.kill() + + def clear_airflow_vars(self): + """ + Reset airflow environment variables to prevent existing ones from impacting behavior. + """ + blank_env_vars = {value['env_var_format']: '' for value in + operator_helpers.AIRFLOW_VAR_NAME_FORMAT_MAPPING.values()} + os.environ.update(blank_env_vars) diff --git a/airflow/operators/hive_stats_operator.py b/airflow/operators/hive_stats_operator.py index fe8ff9dd632fc..8648845d39f0a 100644 --- a/airflow/operators/hive_stats_operator.py +++ b/airflow/operators/hive_stats_operator.py @@ -20,6 +20,7 @@ from builtins import zip from collections import OrderedDict import json +import warnings from airflow.exceptions import AirflowException from airflow.hooks.mysql_hook import MySqlHook @@ -49,9 +50,9 @@ class HiveStatsCollectionOperator(BaseOperator): :param extra_exprs: dict of expression to run against the table where keys are metric names and values are Presto compatible expressions :type extra_exprs: dict - :param col_blacklist: list of columns to blacklist, consider - blacklisting blobs, large json columns, ... - :type col_blacklist: list + :param excluded_columns: list of columns to exclude, consider + excluding blobs, large json columns, ... + :type excluded_columns: list :param assignment_func: a function that receives a column name and a type, and returns a dict of metric names and an Presto expressions. If None is returned, the global defaults are applied. 
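The shims above keep the old ``airflow.operators.check_operator`` import paths working while emitting a ``DeprecationWarning``; migrating is only an import change. A sketch, with placeholder connection id, table and task id:

.. code-block:: python

    # Old import path -- still works, but now emits a DeprecationWarning:
    # from airflow.operators.check_operator import CheckOperator

    # New import path:
    from airflow.operators.sql import SQLCheckOperator

    check = SQLCheckOperator(
        task_id="check_row_count",
        conn_id="my_db",  # placeholder connection id
        sql="SELECT COUNT(*) FROM my_table WHERE ds = '{{ ds }}'",
    )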
If an @@ -69,18 +70,27 @@ def __init__( table, partition, extra_exprs=None, - col_blacklist=None, + excluded_columns=None, assignment_func=None, metastore_conn_id='metastore_default', presto_conn_id='presto_default', mysql_conn_id='airflow_db', *args, **kwargs): - super(HiveStatsCollectionOperator, self).__init__(*args, **kwargs) + if 'col_blacklist' in kwargs: + warnings.warn( + 'col_blacklist kwarg passed to {c} (task_id: {t}) is deprecated, please rename it to ' + 'excluded_columns instead'.format( + c=self.__class__.__name__, t=kwargs.get('task_id')), + category=FutureWarning, + stacklevel=2 + ) + excluded_columns = kwargs.pop('col_blacklist') + super(HiveStatsCollectionOperator, self).__init__(*args, **kwargs) self.table = table self.partition = partition self.extra_exprs = extra_exprs or {} - self.col_blacklist = col_blacklist or {} + self.excluded_columns = excluded_columns or {} self.metastore_conn_id = metastore_conn_id self.presto_conn_id = presto_conn_id self.mysql_conn_id = mysql_conn_id @@ -89,7 +99,7 @@ def __init__( self.dttm = '{{ execution_date.isoformat() }}' def get_default_exprs(self, col, col_type): - if col in self.col_blacklist: + if col in self.excluded_columns: return {} d = {(col, 'non_null'): "COUNT({col})"} if col_type in ['double', 'int', 'bigint', 'float', 'double']: diff --git a/airflow/operators/hive_to_samba_operator.py b/airflow/operators/hive_to_samba_operator.py index 7963524a106e3..e48f96f3f57b1 100644 --- a/airflow/operators/hive_to_samba_operator.py +++ b/airflow/operators/hive_to_samba_operator.py @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -import tempfile +from tempfile import NamedTemporaryFile from airflow.hooks.hive_hooks import HiveServer2Hook from airflow.hooks.samba_hook import SambaHook @@ -59,9 +59,9 @@ def __init__( def execute(self, context): samba = SambaHook(samba_conn_id=self.samba_conn_id) hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id) - tmpfile = tempfile.NamedTemporaryFile() - self.log.info("Fetching file from Hive") - hive.to_csv(hql=self.hql, csv_filepath=tmpfile.name, - hive_conf=context_to_airflow_vars(context)) - self.log.info("Pushing to samba") - samba.push_from_local(self.destination_filepath, tmpfile.name) + with NamedTemporaryFile() as tmp_file: + self.log.info("Fetching file from Hive") + hive.to_csv(hql=self.hql, csv_filepath=tmp_file.name, + hive_conf=context_to_airflow_vars(context)) + self.log.info("Pushing to samba") + samba.push_from_local(self.destination_filepath, tmp_file.name) diff --git a/airflow/operators/python_operator.py b/airflow/operators/python_operator.py index 78b6a410e9892..a6f5ffdc09c07 100644 --- a/airflow/operators/python_operator.py +++ b/airflow/operators/python_operator.py @@ -234,8 +234,8 @@ def __init__( python_version=None, # type: Optional[str] use_dill=False, # type: bool system_site_packages=True, # type: bool - op_args=None, # type: Iterable - op_kwargs=None, # type: Dict + op_args=None, # type: Optional[Iterable] + op_kwargs=None, # type: Optional[Dict] provide_context=False, # type: bool string_args=None, # type: Optional[Iterable[str]] templates_dict=None, # type: Optional[Dict] @@ -330,13 +330,19 @@ def _write_string_args(self, filename): def _write_args(self, input_filename): # serialize args to file + if self.use_dill: + serializer = dill + else: + serializer = pickle + # some items from context can't be loaded in virtual env + # see pr https://github.com/apache/airflow/pull/8256 + not_serializable = {'dag', 
'task', 'ti', 'macros', 'task_instance', 'var'} if self._pass_op_args(): + kwargs = {key: value for key, value in self.op_kwargs.items() + if key not in not_serializable} with open(input_filename, 'wb') as f: - arg_dict = ({'args': self.op_args, 'kwargs': self.op_kwargs}) - if self.use_dill: - dill.dump(arg_dict, f) - else: - pickle.dump(arg_dict, f) + arg_dict = ({'args': self.op_args, 'kwargs': kwargs}) + serializer.dump(arg_dict, f) def _read_result(self, output_filename): if os.stat(output_filename).st_size == 0: diff --git a/airflow/operators/sql.py b/airflow/operators/sql.py new file mode 100644 index 0000000000000..dbc28dc68f623 --- /dev/null +++ b/airflow/operators/sql.py @@ -0,0 +1,637 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import six +from distutils.util import strtobool +from typing import Iterable + +from airflow.exceptions import AirflowException +from airflow.hooks.base_hook import BaseHook +from airflow.models import BaseOperator, SkipMixin +from airflow.utils.decorators import apply_defaults + +ALLOWED_CONN_TYPE = { + "google_cloud_platform", + "jdbc", + "mssql", + "mysql", + "odbc", + "oracle", + "postgres", + "presto", + "snowflake", + "sqlite", + "vertica", +} + + +class SQLCheckOperator(BaseOperator): + """ + Performs checks against a db. The ``SQLCheckOperator`` expects + a sql query that will return a single row. Each value on that + first row is evaluated using python ``bool`` casting. If any of the + values return ``False`` the check is failed and errors out. + + Note that Python bool casting evals the following as ``False``: + + * ``False`` + * ``0`` + * Empty string (``""``) + * Empty list (``[]``) + * Empty dictionary or set (``{}``) + + Given a query like ``SELECT COUNT(*) FROM foo``, it will fail only if + the count ``== 0``. You can craft much more complex query that could, + for instance, check that the table has the same number of rows as + the source table upstream, or that the count of today's partition is + greater than yesterday's partition, or that a set of metrics are less + than 3 standard deviation for the 7 day average. + + This operator can be used as a data quality check in your pipeline, and + depending on where you put it in your DAG, you have the choice to + stop the critical path, preventing from + publishing dubious data, or on the side and receive email alerts + without stopping the progress of the DAG. + + Note that this is an abstract class and get_db_hook + needs to be defined. Whereas a get_db_hook is hook that gets a + single record from an external source. + + :param sql: the sql to be executed. 
(templated) + :type sql: str + """ + + template_fields = ("sql",) # type: Iterable[str] + template_ext = ( + ".hql", + ".sql", + ) # type: Iterable[str] + ui_color = "#fff7e6" + + @apply_defaults + def __init__( + self, sql, conn_id=None, *args, **kwargs + ): + super(SQLCheckOperator, self).__init__(*args, **kwargs) + self.conn_id = conn_id + self.sql = sql + + def execute(self, context=None): + self.log.info("Executing SQL check: %s", self.sql) + records = self.get_db_hook().get_first(self.sql) + + self.log.info("Record: %s", records) + if not records: + raise AirflowException("The query returned None") + elif not all([bool(r) for r in records]): + raise AirflowException( + "Test failed.\nQuery:\n{query}\nResults:\n{records!s}".format( + query=self.sql, records=records + ) + ) + + self.log.info("Success.") + + def get_db_hook(self): + """ + Get the database hook for the connection. + + :return: the database hook object. + :rtype: DbApiHook + """ + return BaseHook.get_hook(conn_id=self.conn_id) + + +def _convert_to_float_if_possible(s): + """ + A small helper function to convert a string to a numeric value + if appropriate + + :param s: the string to be converted + :type s: str + """ + try: + ret = float(s) + except (ValueError, TypeError): + ret = s + return ret + + +class SQLValueCheckOperator(BaseOperator): + """ + Performs a simple value check using sql code. + + Note that this is an abstract class and get_db_hook + needs to be defined. Whereas a get_db_hook is hook that gets a + single record from an external source. + + :param sql: the sql to be executed. (templated) + :type sql: str + """ + + __mapper_args__ = {"polymorphic_identity": "SQLValueCheckOperator"} + template_fields = ( + "sql", + "pass_value", + ) # type: Iterable[str] + template_ext = ( + ".hql", + ".sql", + ) # type: Iterable[str] + ui_color = "#fff7e6" + + @apply_defaults + def __init__( + self, + sql, + pass_value, + tolerance=None, + conn_id=None, + *args, + **kwargs): + super(SQLValueCheckOperator, self).__init__(*args, **kwargs) + self.sql = sql + self.conn_id = conn_id + self.pass_value = str(pass_value) + tol = _convert_to_float_if_possible(tolerance) + self.tol = tol if isinstance(tol, float) else None + self.has_tolerance = self.tol is not None + + def execute(self, context=None): + self.log.info("Executing SQL check: %s", self.sql) + records = self.get_db_hook().get_first(self.sql) + + if not records: + raise AirflowException("The query returned None") + + pass_value_conv = _convert_to_float_if_possible(self.pass_value) + is_numeric_value_check = isinstance(pass_value_conv, float) + + tolerance_pct_str = str(self.tol * 100) + "%" if self.has_tolerance else None + error_msg = ( + "Test failed.\nPass value:{pass_value_conv}\n" + "Tolerance:{tolerance_pct_str}\n" + "Query:\n{sql}\nResults:\n{records!s}" + ).format( + pass_value_conv=pass_value_conv, + tolerance_pct_str=tolerance_pct_str, + sql=self.sql, + records=records, + ) + + if not is_numeric_value_check: + tests = self._get_string_matches(records, pass_value_conv) + elif is_numeric_value_check: + try: + numeric_records = self._to_float(records) + except (ValueError, TypeError): + raise AirflowException( + "Converting a result to float failed.\n{}".format(error_msg) + ) + tests = self._get_numeric_matches(numeric_records, pass_value_conv) + else: + tests = [] + + if not all(tests): + raise AirflowException(error_msg) + + def _to_float(self, records): + return [float(record) for record in records] + + def _get_string_matches(self, records, pass_value_conv): 
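A usage sketch for the value check defined above, showing how ``tolerance`` widens the accepted range; the connection id, query and numbers are placeholders.

.. code-block:: python

    from airflow.operators.sql import SQLValueCheckOperator

    value_check = SQLValueCheckOperator(
        task_id="check_row_count_value",
        conn_id="my_db",
        sql="SELECT COUNT(*) FROM my_table WHERE ds = '{{ ds }}'",
        pass_value=10000,
        tolerance=0.1,  # per the bounds above, any count in [9000, 11000] passes
    )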
+ return [str(record) == pass_value_conv for record in records] + + def _get_numeric_matches(self, numeric_records, numeric_pass_value_conv): + if self.has_tolerance: + return [ + numeric_pass_value_conv * (1 - self.tol) <= record <= numeric_pass_value_conv * (1 + self.tol) + for record in numeric_records + ] + + return [record == numeric_pass_value_conv for record in numeric_records] + + def get_db_hook(self): + """ + Get the database hook for the connection. + + :return: the database hook object. + :rtype: DbApiHook + """ + return BaseHook.get_hook(conn_id=self.conn_id) + + +class SQLIntervalCheckOperator(BaseOperator): + """ + Checks that the values of metrics given as SQL expressions are within + a certain tolerance of the ones from days_back before. + + Note that this is an abstract class and get_db_hook + needs to be defined. Whereas a get_db_hook is hook that gets a + single record from an external source. + + :param table: the table name + :type table: str + :param days_back: number of days between ds and the ds we want to check + against. Defaults to 7 days + :type days_back: int + :param ratio_formula: which formula to use to compute the ratio between + the two metrics. Assuming cur is the metric of today and ref is + the metric to today - days_back. + + max_over_min: computes max(cur, ref) / min(cur, ref) + relative_diff: computes abs(cur-ref) / ref + + Default: 'max_over_min' + :type ratio_formula: str + :param ignore_zero: whether we should ignore zero metrics + :type ignore_zero: bool + :param metrics_threshold: a dictionary of ratios indexed by metrics + :type metrics_threshold: dict + """ + + __mapper_args__ = {"polymorphic_identity": "SQLIntervalCheckOperator"} + template_fields = ("sql1", "sql2") # type: Iterable[str] + template_ext = ( + ".hql", + ".sql", + ) # type: Iterable[str] + ui_color = "#fff7e6" + + ratio_formulas = { + "max_over_min": lambda cur, ref: float(max(cur, ref)) / min(cur, ref), + "relative_diff": lambda cur, ref: float(abs(cur - ref)) / ref, + } + + @apply_defaults + def __init__( + self, + table, + metrics_thresholds, + date_filter_column="ds", + days_back=-7, + ratio_formula="max_over_min", + ignore_zero=True, + conn_id=None, + *args, + **kwargs + ): + super(SQLIntervalCheckOperator, self).__init__(*args, **kwargs) + if ratio_formula not in self.ratio_formulas: + msg_template = ( + "Invalid diff_method: {diff_method}. 
" + "Supported diff methods are: {diff_methods}" + ) + + raise AirflowException( + msg_template.format( + diff_method=ratio_formula, diff_methods=self.ratio_formulas + ) + ) + self.ratio_formula = ratio_formula + self.ignore_zero = ignore_zero + self.table = table + self.metrics_thresholds = metrics_thresholds + self.metrics_sorted = sorted(metrics_thresholds.keys()) + self.date_filter_column = date_filter_column + self.days_back = -abs(days_back) + self.conn_id = conn_id + sqlexp = ", ".join(self.metrics_sorted) + sqlt = "SELECT {sqlexp} FROM {table} WHERE {date_filter_column}=".format( + sqlexp=sqlexp, table=table, date_filter_column=date_filter_column + ) + + self.sql1 = sqlt + "'{{ ds }}'" + self.sql2 = sqlt + "'{{ macros.ds_add(ds, " + str(self.days_back) + ") }}'" + + def execute(self, context=None): + hook = self.get_db_hook() + self.log.info("Using ratio formula: %s", self.ratio_formula) + self.log.info("Executing SQL check: %s", self.sql2) + row2 = hook.get_first(self.sql2) + self.log.info("Executing SQL check: %s", self.sql1) + row1 = hook.get_first(self.sql1) + + if not row2: + raise AirflowException("The query {} returned None".format(self.sql2)) + if not row1: + raise AirflowException("The query {} returned None".format(self.sql1)) + + current = dict(zip(self.metrics_sorted, row1)) + reference = dict(zip(self.metrics_sorted, row2)) + + ratios = {} + test_results = {} + + for metric in self.metrics_sorted: + cur = current[metric] + ref = reference[metric] + threshold = self.metrics_thresholds[metric] + if cur == 0 or ref == 0: + ratios[metric] = None + test_results[metric] = self.ignore_zero + else: + ratios[metric] = self.ratio_formulas[self.ratio_formula]( + current[metric], reference[metric] + ) + test_results[metric] = ratios[metric] < threshold + + self.log.info( + ( + "Current metric for %s: %s\n" + "Past metric for %s: %s\n" + "Ratio for %s: %s\n" + "Threshold: %s\n" + ), + metric, + cur, + metric, + ref, + metric, + ratios[metric], + threshold, + ) + + if not all(test_results.values()): + failed_tests = [it[0] for it in test_results.items() if not it[1]] + self.log.warning( + "The following %s tests out of %s failed:", + len(failed_tests), + len(self.metrics_sorted), + ) + for k in failed_tests: + self.log.warning( + "'%s' check failed. %s is above %s", + k, + ratios[k], + self.metrics_thresholds[k], + ) + raise AirflowException( + "The following tests have failed:\n {0}".format( + ", ".join(sorted(failed_tests)) + ) + ) + + self.log.info("All tests have passed") + + def get_db_hook(self): + """ + Get the database hook for the connection. + + :return: the database hook object. + :rtype: DbApiHook + """ + return BaseHook.get_hook(conn_id=self.conn_id) + + +class SQLThresholdCheckOperator(BaseOperator): + """ + Performs a value check using sql code against a mininmum threshold + and a maximum threshold. Thresholds can be in the form of a numeric + value OR a sql statement that results a numeric. + + Note that this is an abstract class and get_db_hook + needs to be defined. Whereas a get_db_hook is hook that gets a + single record from an external source. + + :param sql: the sql to be executed. 
(templated) + :type sql: str + :param min_threshold: numerical value or min threshold sql to be executed (templated) + :type min_threshold: numeric or str + :param max_threshold: numerical value or max threshold sql to be executed (templated) + :type max_threshold: numeric or str + """ + + template_fields = ("sql", "min_threshold", "max_threshold") # type: Iterable[str] + template_ext = ( + ".hql", + ".sql", + ) # type: Iterable[str] + + @apply_defaults + def __init__( + self, + sql, + min_threshold, + max_threshold, + conn_id=None, + *args, + **kwargs + ): + super(SQLThresholdCheckOperator, self).__init__(*args, **kwargs) + self.sql = sql + self.conn_id = conn_id + self.min_threshold = _convert_to_float_if_possible(min_threshold) + self.max_threshold = _convert_to_float_if_possible(max_threshold) + + def execute(self, context=None): + hook = self.get_db_hook() + result = hook.get_first(self.sql)[0] + + if isinstance(self.min_threshold, float): + lower_bound = self.min_threshold + else: + lower_bound = hook.get_first(self.min_threshold)[0] + + if isinstance(self.max_threshold, float): + upper_bound = self.max_threshold + else: + upper_bound = hook.get_first(self.max_threshold)[0] + + meta_data = { + "result": result, + "task_id": self.task_id, + "min_threshold": lower_bound, + "max_threshold": upper_bound, + "within_threshold": lower_bound <= result <= upper_bound, + } + + self.push(meta_data) + if not meta_data["within_threshold"]: + error_msg = ( + 'Threshold Check: "{task_id}" failed.\n' + 'DAG: {dag_id}\nTask_id: {task_id}\n' + 'Check description: {description}\n' + "SQL: {sql}\n" + 'Result: {round} is not within thresholds ' + '{min} and {max}' + .format(task_id=meta_data.get("task_id"), + dag_id=self.dag_id, + description=meta_data.get("description"), + sql=self.sql, + round=round(meta_data.get("result"), 2), + min=meta_data.get("min_threshold"), + max=meta_data.get("max_threshold"), + )) + raise AirflowException(error_msg) + + self.log.info("Test %s Successful.", self.task_id) + + def push(self, meta_data): + """ + Optional: Send data check info and metadata to an external database. + Default functionality will log metadata. + """ + + info = "\n".join(["{key}: {item}".format(key=key, item=item) for key, item in meta_data.items()]) + self.log.info("Log from %s:\n%s", self.dag_id, info) + + def get_db_hook(self): + """ + Returns DB hook + """ + return BaseHook.get_hook(conn_id=self.conn_id) + + +class BranchSQLOperator(BaseOperator, SkipMixin): + """ + Executes sql code in a specific database + + :param sql: the sql code to be executed. (templated) + :type sql: Can receive a str representing a sql statement or reference to a template file. + Template reference are recognized by str ending in '.sql'. + Expected SQL query to return Boolean (True/False), integer (0 = False, Otherwise = 1) + or string (true/y/yes/1/on/false/n/no/0/off). + :param follow_task_ids_if_true: task id or task ids to follow if query return true + :type follow_task_ids_if_true: str or list + :param follow_task_ids_if_false: task id or task ids to follow if query return true + :type follow_task_ids_if_false: str or list + :param conn_id: reference to a specific database + :type conn_id: str + :param database: name of database which overwrite defined one in connection + :param parameters: (optional) the parameters to render the SQL query with. 
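A branching sketch for the ``BranchSQLOperator`` introduced above; task ids, connection id and query are placeholders.

.. code-block:: python

    from airflow.operators.sql import BranchSQLOperator

    branch = BranchSQLOperator(
        task_id="new_rows_check",
        conn_id="my_db",
        sql="SELECT COUNT(*) > 0 FROM my_table WHERE ds = '{{ ds }}'",
        follow_task_ids_if_true="process_new_rows",
        follow_task_ids_if_false="skip_processing",
    )
    # The first column of the first returned row drives the branch: booleans,
    # 0/1 integers and strings accepted by strtobool ('y', 'true', 'off', ...)
    # are all handled by the execute() logic below.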
+ :type parameters: mapping or iterable + """ + + template_fields = ("sql",) + template_ext = (".sql",) + ui_color = "#a22034" + ui_fgcolor = "#F7F7F7" + + @apply_defaults + def __init__( + self, + sql, + follow_task_ids_if_true, + follow_task_ids_if_false, + conn_id="default_conn_id", + database=None, + parameters=None, + *args, + **kwargs + ): + super(BranchSQLOperator, self).__init__(*args, **kwargs) + self.conn_id = conn_id + self.sql = sql + self.parameters = parameters + self.follow_task_ids_if_true = follow_task_ids_if_true + self.follow_task_ids_if_false = follow_task_ids_if_false + self.database = database + self._hook = None + + def _get_hook(self): + self.log.debug("Get connection for %s", self.conn_id) + conn = BaseHook.get_connection(self.conn_id) + + if conn.conn_type not in ALLOWED_CONN_TYPE: + raise AirflowException( + "The connection type is not supported by BranchSQLOperator.\ + Supported connection types: {}".format(list(ALLOWED_CONN_TYPE)) + ) + + if not self._hook: + self._hook = conn.get_hook() + if self.database: + self._hook.schema = self.database + + return self._hook + + def execute(self, context): + # get supported hook + self._hook = self._get_hook() + + if self._hook is None: + raise AirflowException( + "Failed to establish connection to '%s'" % self.conn_id + ) + + if self.sql is None: + raise AirflowException("Expected 'sql' parameter is missing.") + + if self.follow_task_ids_if_true is None: + raise AirflowException( + "Expected 'follow_task_ids_if_true' paramter is missing." + ) + + if self.follow_task_ids_if_false is None: + raise AirflowException( + "Expected 'follow_task_ids_if_false' parameter is missing." + ) + + self.log.info( + "Executing: %s (with parameters %s) with connection: %s", + self.sql, + self.parameters, + self._hook, + ) + record = self._hook.get_first(self.sql, self.parameters) + if not record: + raise AirflowException( + "No rows returned from sql query. Operator expected True or False return value." 
+ ) + + if isinstance(record, list): + if isinstance(record[0], list): + query_result = record[0][0] + else: + query_result = record[0] + elif isinstance(record, tuple): + query_result = record[0] + else: + query_result = record + + self.log.info("Query returns %s, type '%s'", query_result, type(query_result)) + + follow_branch = None + try: + if isinstance(query_result, bool): + if query_result: + follow_branch = self.follow_task_ids_if_true + elif isinstance(query_result, str): + # return result is not Boolean, try to convert from String to Boolean + if bool(strtobool(query_result)): + follow_branch = self.follow_task_ids_if_true + elif isinstance(query_result, int): + if bool(query_result): + follow_branch = self.follow_task_ids_if_true + elif six.PY2 and isinstance(query_result, long): # noqa + if bool(query_result): + follow_branch = self.follow_task_ids_if_true + else: + raise AirflowException( + "Unexpected query return result '%s' type '%s'" + % (query_result, type(query_result)) + ) + + if follow_branch is None: + follow_branch = self.follow_task_ids_if_false + except ValueError: + raise AirflowException( + "Unexpected query return result '%s' type '%s'" + % (query_result, type(query_result)) + ) + + self.skip_all_except(context["ti"], follow_branch) diff --git a/airflow/operators/sql_branch_operator.py b/airflow/operators/sql_branch_operator.py new file mode 100644 index 0000000000000..b911e345576f7 --- /dev/null +++ b/airflow/operators/sql_branch_operator.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""This module is deprecated. Please use `airflow.operators.sql`.""" +import warnings + +from airflow.operators.sql import BranchSQLOperator + + +class BranchSqlOperator(BranchSQLOperator): + """ + This class is deprecated. + Please use `airflow.operators.sql.BranchSQLOperator`. + """ + + def __init__(self, *args, **kwargs): + warnings.warn( + """This class is deprecated. 
+ Please use `airflow.operators.sql.BranchSQLOperator`.""", + DeprecationWarning, stacklevel=2 + ) + super(BranchSqlOperator, self).__init__(*args, **kwargs) diff --git a/airflow/plugins_manager.py b/airflow/plugins_manager.py index e97c0fd395352..e3cc79db2281a 100644 --- a/airflow/plugins_manager.py +++ b/airflow/plugins_manager.py @@ -25,26 +25,52 @@ from builtins import object import imp import inspect +import logging import os import re -from typing import Any, Dict, List, Set, Type +import sys +import warnings +from typing import Any, Dict, List, Type -import pkg_resources +from six import with_metaclass + +try: + import importlib_metadata +except ImportError: + import importlib.metadata as importlib_metadata from airflow import settings from airflow.models.baseoperator import BaseOperatorLink -from airflow.utils.log.logging_mixin import LoggingMixin -log = LoggingMixin().log +log = logging.getLogger(__name__) import_errors = {} +PY37 = sys.version_info >= (3, 7) + + class AirflowPluginException(Exception): pass -class AirflowPlugin(object): +class _MetaPluginClass(type): + def __new__(cls, name, bases, props): + if props.get('operators', []) or props.get('sensors', []): + warnings.warn( + "Registering operators or sensors in plugins is deprecated -- these should be treated like " + "'plain' python modules, and imported normally in DAGs.\n" + "\n" + "Airflow 2.0 has removed the ability to register these types in plugins. See " + ".", + category=FutureWarning, + stacklevel=2, + ) + + return super(_MetaPluginClass, cls).__new__(cls, name, bases, props) + + +class AirflowPlugin(with_metaclass(_MetaPluginClass, object)): name = None # type: str operators = [] # type: List[Any] sensors = [] # type: List[Any] @@ -87,6 +113,23 @@ def on_load(cls, *args, **kwargs): """ +def entry_points_with_dist(group): + """ + Return EntryPoint objects of the given group, along with the distribution information. + + This is like the ``entry_points()`` function from importlib.metadata, + except it also returns the distribution the entry_point was loaded from. 
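A sketch of a plugin shipped through the ``airflow.plugins`` entry point group that the ``entry_points_with_dist()`` loader above discovers; the distribution, package and class names are placeholders.

.. code-block:: python

    # setup.py of the plugin distribution (illustrative):
    #   entry_points={
    #       "airflow.plugins": ["my_plugin = my_package.my_plugin:MyAirflowPlugin"],
    #   }

    # my_package/my_plugin.py
    from airflow.plugins_manager import AirflowPlugin


    class MyAirflowPlugin(AirflowPlugin):
        name = "my_plugin"
        # Listing operators or sensors here now triggers the FutureWarning added
        # above; DAGs should import them directly instead, e.g.
        #   from my_package.operators import MyOperator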
+ + :param group: FIlter results to only this entrypoint group + :return: Generator of (EntryPoint, Distribution) objects for the specified groups + """ + for dist in importlib_metadata.distributions(): + for e in dist.entry_points: + if e.group != group: + continue + yield (e, dist) + + def load_entrypoint_plugins(entry_points, airflow_plugins): """ Load AirflowPlugin subclasses from the entrypoints @@ -99,43 +142,25 @@ def load_entrypoint_plugins(entry_points, airflow_plugins): :type airflow_plugins: list[type[airflow.plugins_manager.AirflowPlugin]] :rtype: list[airflow.plugins_manager.AirflowPlugin] """ - for entry_point in entry_points: + global import_errors # pylint: disable=global-statement + for entry_point, dist in entry_points: log.debug('Importing entry_point plugin %s', entry_point.name) - plugin_obj = entry_point.load() - if is_valid_plugin(plugin_obj, airflow_plugins): + try: + plugin_obj = entry_point.load() + plugin_obj.__usable_import_name = entry_point.module + if not is_valid_plugin(plugin_obj, airflow_plugins): + continue + if callable(getattr(plugin_obj, 'on_load', None)): plugin_obj.on_load() + airflow_plugins.append(plugin_obj) + except Exception as e: # pylint: disable=broad-except + log.exception("Failed to import plugin %s", entry_point.name) + import_errors[entry_point.module] = str(e) return airflow_plugins -def register_inbuilt_operator_links(): - """ - Register all the Operators Links that are already defined for the operators - in the "airflow" project. Example: QDSLink (Operator Link for Qubole Operator) - - This is required to populate the "whitelist" of allowed classes when deserializing operator links - """ - inbuilt_operator_links = set() # type: Set[Type] - - try: - from airflow.contrib.operators.bigquery_operator import BigQueryConsoleLink, BigQueryConsoleIndexableLink # noqa E501 # pylint: disable=R0401,line-too-long - inbuilt_operator_links.update([BigQueryConsoleLink, BigQueryConsoleIndexableLink]) - except ImportError: - pass - - try: - from airflow.contrib.operators.qubole_operator import QDSLink # pylint: disable=R0401 - inbuilt_operator_links.update([QDSLink]) - except ImportError: - pass - - registered_operator_link_classes.update({ - "{}.{}".format(link.__module__, link.__name__): link - for link in inbuilt_operator_links - }) - - def is_valid_plugin(plugin_obj, existing_plugins): """ Check whether a potential object is a subclass of @@ -176,12 +201,24 @@ def is_valid_plugin(plugin_obj, existing_plugins): continue log.debug('Importing plugin module %s', filepath) + + if mod_name == "__init__": + compat_import_name = root + else: + compat_import_name = os.path.join(root, mod_name) + + compat_import_name = os.path.relpath( + compat_import_name, + settings.PLUGINS_FOLDER, + ).replace(os.sep, '.') + # normalize root path as namespace namespace = '_'.join([re.sub(norm_pattern, '__', root), mod_name]) m = imp.load_source(namespace, filepath) for obj in list(m.__dict__.values()): if is_valid_plugin(obj, plugins): + obj.__usable_import_name = compat_import_name plugins.append(obj) # 插件的on_load没有被调用,在2.0中已修复,此处对当前版本做临时修复 # https://github.com/apache/airflow/issues/10868 @@ -193,11 +230,61 @@ def is_valid_plugin(plugin_obj, existing_plugins): import_errors[filepath] = str(e) plugins = load_entrypoint_plugins( - pkg_resources.iter_entry_points('airflow.plugins'), + entry_points_with_dist('airflow.plugins'), plugins ) +def make_deprecated_module(kind, plugin, objects=None): + name = 'airflow.{}.{}'.format(kind, plugin.name) + module = imp.new_module(name) 
+ module._name = name.split('.')[-1] + if objects is None: + objects = getattr(plugin, kind) + module._objects = objects + objects = {o.__name__: o for o in objects} + + def __getattr__(attrname): + """Get attribute.""" + if attrname not in objects: + raise AttributeError("module '{}' has no attribute '{}'".format(name, attrname)) + + stacklevel = 2 if PY37 else 3 + + obj = objects[attrname] + # Use __qualname__ where we have it for Py 3.3+ + obj_name = getattr(obj, '__qualname__', obj.__name__) + + # Work out what the "correct" import name should be + if obj.__module__ == plugin.__module__: + # Class is defined in the plugin + correct_import_name = '.'.join((plugin.__usable_import_name, obj_name)) + else: + # Class was imported from somewhere else, just direct user to use that instead + correct_import_name = '.'.join((obj.__module__, obj_name)) + + warnings.warn( + "Importing '{}' from under 'airflow.{}.*' has been deprecated and should be directly " + "imported as '{}' instead.\n" + "\n" + "Support for importing from within the airflow namespace for plugins will be dropped entirely " + "in Airflow 2.0. See .".format( + attrname, kind, correct_import_name + ), + category=FutureWarning, + stacklevel=stacklevel + ) + return obj + + def __dir__(): + return objects.keys() + + module.__getattr__ = __getattr__ + module.__dir__ = __dir__ + + return module + + def make_module(name, objects): log.debug('Creating module %s', name) name = name.lower() @@ -233,11 +320,13 @@ def make_module(name, objects): for p in plugins: operators_modules.append( - make_module('airflow.operators.' + p.name, p.operators + p.sensors)) + make_deprecated_module('operators', p, p.operators + p.sensors)) sensors_modules.append( - make_module('airflow.sensors.' + p.name, p.sensors) + make_deprecated_module('sensors', p) + ) + hooks_modules.append( + make_deprecated_module('hooks', p) ) - hooks_modules.append(make_module('airflow.hooks.' + p.name, p.hooks)) executors_modules.append( make_module('airflow.executors.' + p.name, p.executors)) macros_modules.append(make_module('airflow.macros.' + p.name, p.macros)) diff --git a/airflow/secrets/__init__.py b/airflow/secrets/__init__.py index 57dde4b888ca6..8736ac678de4a 100644 --- a/airflow/secrets/__init__.py +++ b/airflow/secrets/__init__.py @@ -22,17 +22,20 @@ * Metatsore database * AWS SSM Parameter store """ -__all__ = ['BaseSecretsBackend', 'get_connections', 'get_variable'] +__all__ = ['BaseSecretsBackend', 'get_connections', 'get_variable', 'get_custom_secret_backend'] import json -from typing import List, Optional +from typing import TYPE_CHECKING, List, Optional from airflow.configuration import conf from airflow.exceptions import AirflowException -from airflow.models.connection import Connection from airflow.secrets.base_secrets import BaseSecretsBackend from airflow.utils.module_loading import import_string +if TYPE_CHECKING: + from airflow.models.connection import Connection + + CONFIG_SECTION = "secrets" DEFAULT_SECRETS_SEARCH_PATH = [ "airflow.secrets.environment_variables.EnvironmentVariablesBackend", @@ -41,7 +44,7 @@ def get_connections(conn_id): - # type: (str) -> List[Connection] + # type: (str) -> List['Connection'] """ Get all connections as an iterable. @@ -72,25 +75,35 @@ def get_variable(key): return None +def get_custom_secret_backend(): + # type: (...) 
-> Optional[BaseSecretsBackend] + """Get Secret Backend if defined in airflow.cfg""" + alternative_secrets_backend = conf.get(section=CONFIG_SECTION, key='backend', fallback='') + + if alternative_secrets_backend: + try: + alternative_secrets_config_dict = json.loads( + conf.get(section=CONFIG_SECTION, key='backend_kwargs', fallback='{}') + ) + except ValueError: + alternative_secrets_config_dict = {} + secrets_backend_cls = import_string(alternative_secrets_backend) + return secrets_backend_cls(**alternative_secrets_config_dict) + return None + + def initialize_secrets_backends(): # type: (...) -> List[BaseSecretsBackend] """ * import secrets backend classes * instantiate them and return them in a list """ - alternative_secrets_backend = conf.get(section=CONFIG_SECTION, key='backend', fallback='') - try: - alternative_secrets_config_dict = json.loads( - conf.get(section=CONFIG_SECTION, key='backend_kwargs', fallback='{}') - ) - except ValueError: - alternative_secrets_config_dict = {} - backend_list = [] - if alternative_secrets_backend: - secrets_backend_cls = import_string(alternative_secrets_backend) - backend_list.append(secrets_backend_cls(**alternative_secrets_config_dict)) + custom_secret_backend = get_custom_secret_backend() + + if custom_secret_backend is not None: + backend_list.append(custom_secret_backend) for class_name in DEFAULT_SECRETS_SEARCH_PATH: secrets_backend_cls = import_string(class_name) diff --git a/airflow/secrets/base_secrets.py b/airflow/secrets/base_secrets.py index 2394f407e7e78..a8c0e6b0204c6 100644 --- a/airflow/secrets/base_secrets.py +++ b/airflow/secrets/base_secrets.py @@ -16,9 +16,7 @@ # under the License. from abc import ABCMeta -from typing import List, Optional - -from airflow.models.connection import Connection +from typing import Optional class BaseSecretsBackend: @@ -56,13 +54,13 @@ def get_conn_uri(self, conn_id): raise NotImplementedError() def get_connections(self, conn_id): - # type: (str) -> List[Connection] """ Get connections with a specific ID :param conn_id: connection id :type conn_id: str """ + from airflow.models.connection import Connection conn_uri = self.get_conn_uri(conn_id=conn_id) if not conn_uri: return [] @@ -78,3 +76,13 @@ def get_variable(self, key): :return: Variable Value """ raise NotImplementedError() + + def get_config(self, key): # pylint: disable=unused-argument + # type: (str) -> Optional[str] + """ + Return value for Airflow Config Key + + :param key: Config Key + :return: Config Value + """ + return None diff --git a/airflow/secrets/local_filesystem.py b/airflow/secrets/local_filesystem.py new file mode 100644 index 0000000000000..1d97979ca4825 --- /dev/null +++ b/airflow/secrets/local_filesystem.py @@ -0,0 +1,268 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
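A sketch of how the ``[secrets]`` options read by ``get_custom_secret_backend()`` above can point at the local filesystem backend added below; the file paths are placeholders, and ``backend_kwargs`` is parsed as JSON and passed to the backend constructor (unparsable input falls back to an empty dict in the code above).

.. code-block:: python

    # airflow.cfg (illustrative):
    #   [secrets]
    #   backend = airflow.secrets.local_filesystem.LocalFilesystemBackend
    #   backend_kwargs = {"connections_file_path": "/files/conns.env", "variables_file_path": "/files/vars.json"}

    from airflow.secrets import get_custom_secret_backend

    backend = get_custom_secret_backend()
    if backend is not None:
        print(backend.get_variable("my_var"))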
+""" +Objects relating to retrieving connections and variables from local file +""" +import json +import logging +import os +from collections import defaultdict + +import funcsigs +import six + +from airflow.exceptions import AirflowException, AirflowFileParseException, file_syntax_error +from airflow.secrets.base_secrets import BaseSecretsBackend +from airflow.utils.file import COMMENT_PATTERN +from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) + + +def get_connection_parameter_names(): + """Returns :class:`airflow.models.connection.Connection` constructor parameters.""" + from airflow.models.connection import Connection + + return {k for k in funcsigs.signature(Connection.__init__).parameters.keys() if k != "self"} + + +def _parse_env_file(file_path): + """ + Parse a file in the ``.env '' format. + + .. code-block:: text + + MY_CONN_ID=my-conn-type://my-login:my-pa%2Fssword@my-host:5432/my-schema?param1=val1¶m2=val2 + + :param file_path: The location of the file that will be processed. + :type file_path: str + :return: Tuple with mapping of key and list of values and list of syntax errors + """ + with open(file_path) as f: + content = f.read() + + secrets = defaultdict(list) + errors = [] + for line_no, line in enumerate(content.splitlines(), 1): + if not line: + # Ignore empty line + continue + + if COMMENT_PATTERN.match(line): + # Ignore comments + continue + + var_parts = line.split("=", 2) + if len(var_parts) != 2: + errors.append( + file_syntax_error( + line_no=line_no, + message='Invalid line format. The line should contain at least one equal sign ("=").', + ) + ) + continue + + key, value = var_parts + if not key: + errors.append(file_syntax_error(line_no=line_no, message="Invalid line format. Key is empty.",)) + secrets[key].append(value) + return secrets, errors + + +def _parse_json_file(file_path): + """ + Parse a file in the JSON format. + + :param file_path: The location of the file that will be processed. + :type file_path: str + :return: Tuple with mapping of key and list of values and list of syntax errors + """ + with open(file_path) as f: + content = f.read() + + if not content: + return {}, [file_syntax_error(line_no=1, message="The file is empty.")] + try: + secrets = json.loads(content) + except ValueError as e: + return {}, [file_syntax_error(line_no=1, message=str(e))] + if not isinstance(secrets, dict): + return {}, [file_syntax_error(line_no=1, message="The file should contain the object.")] + + return secrets, [] + + +FILE_PARSERS = { + "env": _parse_env_file, + "json": _parse_json_file, +} + + +def _parse_secret_file(file_path): + """ + Based on the file extension format, selects a parser, and parses the file. + + :param file_path: The location of the file that will be processed. + :type file_path: str + :return: Map of secret key (e.g. connection ID) and value. + """ + if not os.path.exists(file_path): + raise AirflowException( + "File {} was not found. Check the configuration of your Secrets backend.".format(file_path) + ) + + log.debug("Parsing file: %s", file_path) + + ext = file_path.rsplit(".", 2)[-1].lower() + + if ext not in FILE_PARSERS: + raise AirflowException("Unsupported file format. 
The file must have the extension .env or .json") + + secrets, parse_errors = FILE_PARSERS[ext](file_path) + + log.debug("Parsed file: len(parse_errors)=%d, len(secrets)=%d", len(parse_errors), len(secrets)) + + if parse_errors: + raise AirflowFileParseException( + "Failed to load the secret file.", file_path=file_path, parse_errors=parse_errors + ) + + return secrets + + +def _create_connection(conn_id, value): + """ + Creates a connection based on a URL or JSON object. + """ + from airflow.models.connection import Connection + + if isinstance(value, six.string_types): + return Connection(conn_id=conn_id, uri=value) + if isinstance(value, dict): + connection_parameter_names = get_connection_parameter_names() + current_keys = set(value.keys()) + if not current_keys.issubset(connection_parameter_names): + illegal_keys = current_keys - connection_parameter_names + illegal_keys_list = ", ".join(illegal_keys) + raise AirflowException( + "The object have illegal keys: {}." + "The dictionary can only contain the following keys: {}".format( + illegal_keys_list, connection_parameter_names + ) + ) + + if "conn_id" in current_keys and conn_id != value["conn_id"]: + raise AirflowException( + "Mismatch conn_id. " + "The dictionary key has the value: " + value['conn_id'] + ". " + "The item has the value: " + conn_id + " ." + ) + value["conn_id"] = conn_id + return Connection(**value) + raise AirflowException( + "Unexpected value type: {}" + ". The connection can only be defined using a string or object.".format(type(value)) + ) + + +def load_variables(file_path): + """ + Load variables from a text file. + Both ``JSON`` and ``.env`` files are supported. + + :param file_path: The location of the file that will be processed. + :type file_path: str + :rtype: dict[str, list[str]] + """ + log.debug("Loading variables from a text file") + + secrets = _parse_secret_file(file_path) + invalid_keys = [key for key, values in secrets.items() if isinstance(values, list) and len(values) != 1] + if invalid_keys: + raise AirflowException( + 'The "{}" file contains multiple values for keys: {}'.format(file_path, invalid_keys) + ) + variables = {key: values[0] if isinstance(values, list) else values for key, values in secrets.items()} + log.debug("Loaded %d variables: ", len(variables)) + return variables + + +def load_connections(file_path): + """ + Load connection from text file. + Both ``JSON`` and ``.env`` files are supported. + + :return: A dictionary where the key contains a connection ID and the value contains a list of connections. + :rtype: list[str, list[airflow.models.connection.Connection]] + """ + log.debug("Loading connection") + + secrets = _parse_secret_file(file_path) + connections_by_conn_id = defaultdict(list) + for key, secret_values in list(secrets.items()): + if isinstance(secret_values, list): + for secret_value in secret_values: + connections_by_conn_id[key].append(_create_connection(key, secret_value)) + else: + connections_by_conn_id[key].append(_create_connection(key, secret_values)) + num_conn = sum(map(len, connections_by_conn_id.values())) + log.debug("Loaded %d connections", num_conn) + + return connections_by_conn_id + + +class LocalFilesystemBackend(BaseSecretsBackend, LoggingMixin): + """ + Retrieves Connection objects and Variables from local files + Both ``JSON`` and ``.env`` files are supported. + + :param variables_file_path: File location with variables data. + :type variables_file_path: str + :param connections_file_path: File location with connection data. 
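A quick sketch of the two file formats the parsers above accept and of calling the loaders directly; paths and values are illustrative.

.. code-block:: python

    from airflow.secrets.local_filesystem import load_connections, load_variables

    # /files/vars.json   ->  {"my_var": "some value"}
    # /files/conns.env   ->  MY_CONN=postgres://login:secret@host:5432/schema

    variables = load_variables("/files/vars.json")      # {'my_var': 'some value'}
    connections = load_connections("/files/conns.env")  # {'MY_CONN': [<Connection>]}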
+ :type connections_file_path: str + """ + + def __init__( + self, variables_file_path=None, connections_file_path=None + ): + super(LocalFilesystemBackend, self).__init__() + self.variables_file = variables_file_path + self.connections_file = connections_file_path + + @property + def _local_variables(self): + if not self.variables_file: + self.log.debug("The file for variables is not specified. Skipping") + # The user may not specify any file. + return {} + secrets = load_variables(self.variables_file) + return secrets + + @property + def _local_connections(self): + if not self.connections_file: + self.log.debug("The file for connection is not specified. Skipping") + # The user may not specify any file. + return {} + return load_connections(self.connections_file) + + def get_connections(self, conn_id): + return self._local_connections.get(conn_id) or [] + + def get_variable(self, key): + return self._local_variables.get(key) diff --git a/airflow/secrets/metastore.py b/airflow/secrets/metastore.py index 51d8740cda7f3..f1412e9b1fa09 100644 --- a/airflow/secrets/metastore.py +++ b/airflow/secrets/metastore.py @@ -19,9 +19,6 @@ Objects relating to sourcing connections from metastore database """ -from typing import List - -from airflow.models.connection import Connection from airflow.secrets import BaseSecretsBackend from airflow.utils.db import provide_session @@ -34,7 +31,7 @@ class MetastoreBackend(BaseSecretsBackend): # pylint: disable=missing-docstring @provide_session def get_connections(self, conn_id, session=None): - # type: (...) -> List[Connection] + from airflow.models.connection import Connection conn_list = session.query(Connection).filter(Connection.conn_id == conn_id).all() session.expunge_all() return conn_list diff --git a/airflow/security/kerberos.py b/airflow/security/kerberos.py index 35fb671d70195..dfde1123ac132 100644 --- a/airflow/security/kerberos.py +++ b/airflow/security/kerberos.py @@ -32,18 +32,17 @@ # See the License for the specific language governing permissions and # limitations under the License. """Kerberos security provider""" - +import logging import socket import subprocess import sys import time -from airflow import LoggingMixin from airflow.configuration import conf NEED_KRB181_WORKAROUND = None -log = LoggingMixin().log +log = logging.getLogger(__name__) def renew_from_kt(principal, keytab): @@ -83,7 +82,9 @@ def renew_from_kt(principal, keytab): if subp.returncode != 0: log.error( "Couldn't reinit from keytab! 
`kinit' exited with %s.\n%s\n%s", - subp.returncode, "\n".join(subp.stdout.readlines()), "\n".join(subp.stderr.readlines()) + subp.returncode, + "\n".join(subp.stdout.readlines() if subp.stdout else []), + "\n".join(subp.stderr.readlines() if subp.stderr else []) ) sys.exit(subp.returncode) diff --git a/airflow/sensors/__init__.py b/airflow/sensors/__init__.py index b9d1dbb03de6d..449ed3163cfe7 100644 --- a/airflow/sensors/__init__.py +++ b/airflow/sensors/__init__.py @@ -20,6 +20,9 @@ import sys import os as _os + +PY37 = sys.version_info >= (3, 7) + _sensors = { 'base_sensor_operator': ['BaseSensorOperator'], 'external_task_sensor': ['ExternalTaskSensor'], @@ -46,6 +49,10 @@ def _integrate_plugins(): from airflow.plugins_manager import sensors_modules for sensors_module in sensors_modules: sys.modules[sensors_module.__name__] = sensors_module + + if not PY37: + from pep562 import Pep562 + sensors_module = Pep562(sensors_module.__name__) globals()[sensors_module._name] = sensors_module ########################################################## diff --git a/airflow/sensors/date_time_sensor.py b/airflow/sensors/date_time_sensor.py new file mode 100644 index 0000000000000..a62f3cb8c97b2 --- /dev/null +++ b/airflow/sensors/date_time_sensor.py @@ -0,0 +1,76 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime + +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils import timezone +from airflow.utils.decorators import apply_defaults + + +class DateTimeSensor(BaseSensorOperator): + """ + Waits until the specified datetime. + + A major advantage of this sensor is idempotence for the ``target_time``. + It handles some cases for which ``TimeSensor`` and ``TimeDeltaSensor`` are not suited. + + **Example** 1 : + If a task needs to wait for 11am on each ``execution_date``. Using + ``TimeSensor`` or ``TimeDeltaSensor``, all backfill tasks started at + 1am have to wait for 10 hours. This is unnecessary, e.g. a backfill + task with ``{{ ds }} = '1970-01-01'`` does not need to wait because + ``1970-01-01T11:00:00`` has already passed. + + **Example** 2 : + If a DAG is scheduled to run at 23:00 daily, but one of the tasks is + required to run at 01:00 next day, using ``TimeSensor`` will return + ``True`` immediately because 23:00 > 01:00. Instead, we can do this: + + .. code-block:: python + + DateTimeSensor( + task_id='wait_for_0100', + target_time='{{ next_execution_date.tomorrow().replace(hour=1) }}', + ) + + :param target_time: datetime after which the job succeeds. 
(templated) + :type target_time: str or datetime.datetime + """ + + template_fields = ("target_time",) + + @apply_defaults + def __init__( + self, target_time, *args, **kwargs + ): + super(DateTimeSensor, self).__init__(*args, **kwargs) + if isinstance(target_time, datetime.datetime): + self.target_time = target_time.isoformat() + elif isinstance(target_time, str): + self.target_time = target_time + else: + raise TypeError( + "Expected str or datetime.datetime type for target_time. Got {}".format( + type(target_time) + ) + ) + + def poke(self, context): + self.log.info("Checking if the time (%s) has come", self.target_time) + return timezone.utcnow() > timezone.parse(self.target_time) diff --git a/airflow/sensors/external_task_sensor.py b/airflow/sensors/external_task_sensor.py index 1b93528ad32b1..b759a71d58f23 100644 --- a/airflow/sensors/external_task_sensor.py +++ b/airflow/sensors/external_task_sensor.py @@ -104,7 +104,7 @@ def poke(self, context, session=None): if self.execution_delta: dttm = context['execution_date'] - self.execution_delta elif self.execution_date_fn: - dttm = self.execution_date_fn(context['execution_date']) + dttm = self._handle_execution_date_fn(context=context) else: dttm = context['execution_date'] @@ -159,6 +159,26 @@ def poke(self, context, session=None): session.commit() return count == len(dttm_filter) + def _handle_execution_date_fn(self, context): + """ + This function is to handle backwards compatibility with how this operator was + previously where it only passes the execution date, but also allow for the newer + implementation to pass all context through as well, to allow for more sophisticated + returns of dates to return. + Namely, this function check the number of arguments in the execution_date_fn + signature and if its 1, treat the legacy way, if it's 2, pass the context as + the 2nd argument, and if its more, throw an exception. + """ + num_fxn_params = self.execution_date_fn.__code__.co_argcount + if num_fxn_params == 1: + return self.execution_date_fn(context['execution_date']) + elif num_fxn_params == 2: + return self.execution_date_fn(context['execution_date'], context) + else: + raise AirflowException( + 'execution_date_fn passed {} args but only allowed up to 2'.format(num_fxn_params) + ) + class ExternalTaskMarker(DummyOperator): """ @@ -181,6 +201,9 @@ class ExternalTaskMarker(DummyOperator): template_fields = ['external_dag_id', 'external_task_id', 'execution_date'] ui_color = '#19647e' + # The _serialized_fields are lazily loaded when get_serialized_fields() method is called + __serialized_fields = None + @apply_defaults def __init__(self, external_dag_id, @@ -202,3 +225,14 @@ def __init__(self, if recursion_depth <= 0: raise ValueError("recursion_depth should be a positive integer") self.recursion_depth = recursion_depth + + @classmethod + def get_serialized_fields(cls): + """Serialized ExternalTaskMarker contain exactly these fields + templated_fields .""" + if not cls.__serialized_fields: + cls.__serialized_fields = frozenset( + super(ExternalTaskMarker, cls).get_serialized_fields() | { + "recursion_depth" + } + ) + return cls.__serialized_fields diff --git a/airflow/sensors/hdfs_sensor.py b/airflow/sensors/hdfs_sensor.py index bc3a3c63fd64e..f18718474029e 100644 --- a/airflow/sensors/hdfs_sensor.py +++ b/airflow/sensors/hdfs_sensor.py @@ -16,7 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
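# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the patch): the two call styles
# accepted by ``execution_date_fn`` after the ``_handle_execution_date_fn``
# change to ExternalTaskSensor below. DAG and task ids are hypothetical.

from airflow import DAG
from airflow.sensors.external_task_sensor import ExternalTaskSensor
from airflow.utils.dates import days_ago

with DAG("example_external_wait", start_date=days_ago(1), schedule_interval="@daily") as dag:
    # Legacy signature: a single argument, the execution_date (pendulum datetime).
    wait_legacy = ExternalTaskSensor(
        task_id="wait_legacy",
        external_dag_id="upstream_dag",
        external_task_id="upstream_task",
        execution_date_fn=lambda dt: dt.subtract(hours=1),
    )

    # New signature: a second argument receives the full template context, so
    # the returned date can depend on anything available there.
    wait_with_context = ExternalTaskSensor(
        task_id="wait_with_context",
        external_dag_id="upstream_dag",
        external_task_id="upstream_task",
        execution_date_fn=lambda dt, context: context["prev_execution_date"],
    )
# A callable taking three or more positional arguments raises AirflowException.
# ---------------------------------------------------------------------------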
- +import logging import re import sys from builtins import str @@ -25,7 +25,9 @@ from airflow.hooks.hdfs_hook import HDFSHook from airflow.sensors.base_sensor_operator import BaseSensorOperator from airflow.utils.decorators import apply_defaults -from airflow.utils.log.logging_mixin import LoggingMixin + + +log = logging.getLogger(__name__) class HdfsSensor(BaseSensorOperator): @@ -65,7 +67,6 @@ def filter_for_filesize(result, size=None): :return: (bool) depending on the matching criteria """ if size: - log = LoggingMixin().log log.debug( 'Filtering for file size >= %s in files: %s', size, map(lambda x: x['path'], result) @@ -90,7 +91,6 @@ def filter_for_ignored_ext(result, ignored_ext, ignore_copying): :rtype: list[dict] """ if ignore_copying: - log = LoggingMixin().log regex_builder = r"^.*\.(%s$)$" % '$|'.join(ignored_ext) ignored_extensions_regex = re.compile(regex_builder) log.debug( diff --git a/airflow/sensors/sql_sensor.py b/airflow/sensors/sql_sensor.py index 54b29824c76e0..47a375cae5e9b 100644 --- a/airflow/sensors/sql_sensor.py +++ b/airflow/sensors/sql_sensor.py @@ -77,7 +77,7 @@ def _get_hook(self): allowed_conn_type = {'google_cloud_platform', 'jdbc', 'mssql', 'mysql', 'oracle', 'postgres', - 'presto', 'sqlite', 'vertica'} + 'presto', 'snowflake', 'sqlite', 'vertica'} if conn.conn_type not in allowed_conn_type: raise AirflowException("The connection type is not supported by SqlSensor. " + "Supported connection types: {}".format(list(allowed_conn_type))) diff --git a/airflow/sensors/time_sensor.py b/airflow/sensors/time_sensor.py index 0c392354532a6..5c41c2cf752ad 100644 --- a/airflow/sensors/time_sensor.py +++ b/airflow/sensors/time_sensor.py @@ -37,4 +37,4 @@ def __init__(self, target_time, *args, **kwargs): def poke(self, context): self.log.info('Checking if the time (%s) has come', self.target_time) - return timezone.utcnow().time() > self.target_time + return timezone.make_naive(timezone.utcnow(), self.dag.timezone).time() > self.target_time diff --git a/airflow/sentry.py b/airflow/sentry.py index 7d22aec65d3c7..ff8596915c74f 100644 --- a/airflow/sentry.py +++ b/airflow/sentry.py @@ -18,17 +18,14 @@ # under the License. """Sentry Integration""" - - +import logging from functools import wraps from airflow.configuration import conf from airflow.utils.db import provide_session -from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.state import State - -log = LoggingMixin().log +log = logging.getLogger(__name__) class DummySentry: @@ -55,6 +52,11 @@ def enrich_errors(cls, run): """ return run + def flush(self): + """ + Blank function for flushing errors. 
+ """ + class ConfiguredSentry(DummySentry): """ @@ -150,6 +152,10 @@ def wrapper(task_instance, session=None, *args, **kwargs): return wrapper + def flush(self): + import sentry_sdk + sentry_sdk.flush() + Sentry = DummySentry() # type: DummySentry diff --git a/airflow/serialization/json_schema.py b/airflow/serialization/json_schema.py index e33ce8cdc9491..3ea56af91fca8 100644 --- a/airflow/serialization/json_schema.py +++ b/airflow/serialization/json_schema.py @@ -23,10 +23,10 @@ from typing import Iterable import jsonschema -from typing_extensions import Protocol from airflow.exceptions import AirflowException from airflow.settings import json +from airflow.typing_compat import Protocol class Validator(Protocol): diff --git a/airflow/serialization/serialized_objects.py b/airflow/serialization/serialized_objects.py index 3d2029ac866b0..c527ddfccf8e2 100644 --- a/airflow/serialization/serialized_objects.py +++ b/airflow/serialization/serialized_objects.py @@ -23,19 +23,20 @@ import enum import logging import six -from typing import TYPE_CHECKING, Optional, Union, Dict +from typing import TYPE_CHECKING, Optional, Union, Dict, List import cattr import pendulum from dateutil import relativedelta -from airflow import DAG, AirflowException, LoggingMixin +from airflow import DAG, AirflowException from airflow.models.baseoperator import BaseOperator, BaseOperatorLink from airflow.models.connection import Connection from airflow.serialization.enums import DagAttributeTypes as DAT, Encoding from airflow.serialization.helpers import serialize_template_field from airflow.serialization.json_schema import Validator, load_dag_schema from airflow.settings import json +from airflow.utils.module_loading import import_string from airflow.www.utils import get_python_source try: @@ -46,6 +47,19 @@ if TYPE_CHECKING: from inspect import Parameter +log = logging.getLogger(__name__) + + +BUILTIN_OPERATOR_EXTRA_LINKS = [ + "airflow.contrib.operators.bigquery_operator.BigQueryConsoleLink", + "airflow.contrib.operators.bigquery_operator.BigQueryConsoleIndexableLink", + "airflow.contrib.operators.qubole_operator.QDSLink", + # providers new paths + "airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleLink", + "airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink", + "airflow.providers.qubole.operators.qubole.QDSLink" +] # type: List[str] + class BaseSerialization: """BaseSerialization provides utils for serialization.""" @@ -122,10 +136,16 @@ def _is_primitive(cls, var): @classmethod def _is_excluded(cls, var, attrname, instance): """Types excluded from serialization.""" + + if var is None: + if not cls._is_constructor_param(attrname, instance): + # Any instance attribute, that is not a constructor argument, we exclude None as the default + return True + + return cls._value_is_hardcoded_default(attrname, var, instance) return ( - var is None or isinstance(var, cls._excluded_types) or - cls._value_is_hardcoded_default(attrname, var) + cls._value_is_hardcoded_default(attrname, var, instance) ) @classmethod @@ -206,10 +226,10 @@ def _serialize(cls, var): # pylint: disable=too-many-return-statements return cls._encode( [cls._serialize(v) for v in var], type_=DAT.TUPLE) else: - LOG.debug('Cast type %s to str in serialization.', type(var)) + log.debug('Cast type %s to str in serialization.', type(var)) return str(var) except Exception: # pylint: disable=broad-except - LOG.warning('Failed to stringify.', exc_info=True) + log.warning('Failed to stringify.', exc_info=True) return FAILED 
@classmethod @@ -259,7 +279,12 @@ def _deserialize_timedelta(cls, seconds): return datetime.timedelta(seconds=seconds) @classmethod - def _value_is_hardcoded_default(cls, attrname, value): + def _is_constructor_param(cls, attrname, instance): + # pylint: disable=unused-argument + return attrname in cls._CONSTRUCTOR_PARAMS + + @classmethod + def _value_is_hardcoded_default(cls, attrname, value, instance): """ Return true if ``value`` is the hard-coded default for the given attribute. This takes in to account cases where the ``concurrency`` parameter is @@ -273,8 +298,9 @@ def _value_is_hardcoded_default(cls, attrname, value): to account for the case where the default value of the field is None but has the ``field = field or {}`` set. """ + # pylint: disable=unused-argument if attrname in cls._CONSTRUCTOR_PARAMS and \ - (cls._CONSTRUCTOR_PARAMS[attrname].default is value or (value in [{}, []])): + (cls._CONSTRUCTOR_PARAMS[attrname] is value or (value in [{}, []])): return True return False @@ -288,7 +314,7 @@ class SerializedBaseOperator(BaseOperator, BaseSerialization): _decorated_fields = {'executor_config', } _CONSTRUCTOR_PARAMS = { - k: v for k, v in signature(BaseOperator).parameters.items() + k: v.default for k, v in signature(BaseOperator).parameters.items() if v.default is not v.empty } @@ -445,15 +471,17 @@ def _deserialize_operator_extra_links( # list(_operator_links_source.items())[0] = # ('airflow.gcp.operators.bigquery.BigQueryConsoleIndexableLink', {'index': 0}) - _operator_link_class, data = list(_operator_links_source.items())[0] - - if _operator_link_class in registered_operator_link_classes: - single_op_link_class_name = registered_operator_link_classes[_operator_link_class] + _operator_link_class_path, data = list(_operator_links_source.items())[0] + if _operator_link_class_path in BUILTIN_OPERATOR_EXTRA_LINKS: + single_op_link_class = import_string(_operator_link_class_path) + elif _operator_link_class_path in registered_operator_link_classes: + single_op_link_class = registered_operator_link_classes[_operator_link_class_path] else: - raise KeyError("Operator Link class %r not registered" % _operator_link_class) + log.error("Operator Link class %r not registered", _operator_link_class_path) + return {} op_predefined_extra_link = cattr.structure( - data, single_op_link_class_name) # type: BaseOperatorLink + data, single_op_link_class) # type: BaseOperatorLink op_predefined_extra_links.update( {op_predefined_extra_link.name: op_predefined_extra_link} @@ -511,7 +539,7 @@ def __get_constructor_defaults(): # pylint: disable=no-method-argument 'access_control': '_access_control', } return { - param_to_attr.get(k, k): v for k, v in signature(DAG).parameters.items() + param_to_attr.get(k, k): v.default for k, v in signature(DAG).parameters.items() if v.default is not v.empty } @@ -548,7 +576,7 @@ def deserialize_dag(cls, encoded_dag): k = "task_dict" elif k == "timezone": v = cls._deserialize_timezone(v) - elif k in {"retry_delay", "execution_timeout"}: + elif k in {"dagrun_timeout"}: v = cls._deserialize_timedelta(v) elif k.endswith("_date"): v = cls._deserialize_datetime(v) @@ -579,7 +607,7 @@ def deserialize_dag(cls, encoded_dag): for task_id in serializable_task.downstream_task_ids: # Bypass set_upstream etc here - it does more than we want # noinspection PyProtectedMember - dag.task_dict[task_id]._upstream_task_ids.add(task_id) # pylint: disable=protected-access + dag.task_dict[task_id]._upstream_task_ids.add(serializable_task.task_id) # noqa: E501 # pylint: 
disable=protected-access return dag @@ -607,7 +635,5 @@ def from_dict(cls, serialized_obj): return cls.deserialize_dag(serialized_obj['dag']) -LOG = LoggingMixin().log - # Serialization failure returns 'failed'. FAILED = 'serialization_failed' diff --git a/airflow/settings.py b/airflow/settings.py index 6f82e5d904cd4..c708d9039de24 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -26,10 +26,11 @@ import json import logging import os -import pendulum import sys +import warnings from typing import Any +import pendulum from sqlalchemy import create_engine, exc from sqlalchemy.orm import scoped_session, sessionmaker from sqlalchemy.pool import NullPool @@ -153,37 +154,57 @@ def timing(self, stat, dt): # The JSON library to use for DAG Serialization and De-Serialization json = json - -def policy(task_instance): +# Dictionary containing State and colors associated to each state to +# display on the Webserver +STATE_COLORS = { + "queued": "gray", + "running": "lime", + "success": "green", + "failed": "red", + "up_for_retry": "gold", + "up_for_reschedule": "turquoise", + "upstream_failed": "orange", + "skipped": "pink", + "scheduled": "tan", +} + + +def policy(task): """ - This policy setting allows altering task instances right before they - are executed. It allows administrator to rewire some task parameters. - - Note that the ``TaskInstance`` object has an attribute ``task`` pointing - to its related task object, that in turns has a reference to the DAG - object. So you can use the attributes of all of these to define your - policy. + This policy setting allows altering tasks after they are loaded in + the DagBag. It allows administrator to rewire some task parameters. To define policy, add a ``airflow_local_settings`` module - to your PYTHONPATH that defines this ``policy`` function. It receives - a ``TaskInstance`` object and can alter it where needed. + to your PYTHONPATH that defines this ``policy`` function. Here are a few examples of how this can be useful: * You could enforce a specific queue (say the ``spark`` queue) for tasks using the ``SparkOperator`` to make sure that these - task instances get wired to the right workers - * You could force all task instances running on an - ``execution_date`` older than a week old to run in a ``backfill`` - pool. + tasks get wired to the right workers + * You could enforce a task timeout policy, making sure that no tasks run + for more than 48 hours * ... """ +def task_instance_mutation_hook(task_instance): + """ + This setting allows altering task instances before they are queued by + the Airflow scheduler. + + To define task_instance_mutation_hook, add a ``airflow_local_settings`` module + to your PYTHONPATH that defines this ``task_instance_mutation_hook`` function. + + This could be used, for instance, to modify the task instance during retries. + """ + + def pod_mutation_hook(pod): """ - This setting allows altering ``Pod`` objects before they are passed to - the Kubernetes client by the ``PodLauncher`` for scheduling. + This setting allows altering ``kubernetes.client.models.V1Pod`` object + before they are passed to the Kubernetes client by the ``PodLauncher`` + for scheduling. To define a pod mutation hook, add a ``airflow_local_settings`` module to your PYTHONPATH that defines this ``pod_mutation_hook`` function. 
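# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the patch): a hypothetical
# airflow_local_settings.py combining the hooks documented above. Place it on
# PYTHONPATH (for example $AIRFLOW_HOME/config/airflow_local_settings.py) and
# Airflow picks the names up when settings are imported. All values below are
# examples, not recommendations.

STATE_COLORS = {
    "running": "#01FF70",   # override a single state colour in the web UI
}


def policy(task):
    # Route every Spark task to a dedicated queue.
    if task.task_type == "SparkSubmitOperator":
        task.queue = "spark"


def task_instance_mutation_hook(task_instance):
    # Example: send retried task instances to a slower, cheaper queue.
    if task_instance.try_number >= 2:
        task_instance.queue = "retry_queue"


def pod_mutation_hook(pod):
    # Label every pod launched by the PodLauncher.
    pod.metadata.labels = pod.metadata.labels or {}
    pod.metadata.labels["team"] = "data-eng"
# ---------------------------------------------------------------------------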
@@ -212,12 +233,38 @@ def configure_orm(disable_connection_pool=False): log.debug("Setting up DB connection pool (PID %s)" % os.getpid()) global engine global Session - engine_args = {} + engine_args = prepare_engine_args(disable_connection_pool) + + # Allow the user to specify an encoding for their DB otherwise default + # to utf-8 so jobs & users with non-latin1 characters can still use us. + engine_args['encoding'] = conf.get('core', 'SQL_ENGINE_ENCODING', fallback='utf-8') + + # For Python2 we get back a newstr and need a str + engine_args['encoding'] = engine_args['encoding'].__str__() + + if conf.has_option('core', 'sql_alchemy_connect_args'): + connect_args = conf.getimport('core', 'sql_alchemy_connect_args') + else: + connect_args = {} + + engine = create_engine(SQL_ALCHEMY_CONN, connect_args=connect_args, **engine_args) + setup_event_handlers(engine) + + Session = scoped_session(sessionmaker( + autocommit=False, + autoflush=False, + bind=engine, + expire_on_commit=False, + )) + +def prepare_engine_args(disable_connection_pool=False): + """Prepare SQLAlchemy engine args""" + engine_args = {} pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED') if disable_connection_pool or not pool_connections: engine_args['poolclass'] = NullPool - log.debug("settings.configure_orm(): Using NullPool") + log.debug("settings.prepare_engine_args(): Using NullPool") elif 'sqlite' not in SQL_ALCHEMY_CONN: # Pool size engine args not supported by sqlite. # If no config value is defined for the pool size, select a reasonable value. @@ -249,28 +296,13 @@ def configure_orm(disable_connection_pool=False): # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic pool_pre_ping = conf.getboolean('core', 'SQL_ALCHEMY_POOL_PRE_PING', fallback=True) - log.debug("settings.configure_orm(): Using pool settings. pool_size=%d, max_overflow=%d, " + log.debug("settings.prepare_engine_args(): Using pool settings. pool_size=%d, max_overflow=%d, " "pool_recycle=%d, pid=%d", pool_size, max_overflow, pool_recycle, os.getpid()) engine_args['pool_size'] = pool_size engine_args['pool_recycle'] = pool_recycle engine_args['pool_pre_ping'] = pool_pre_ping engine_args['max_overflow'] = max_overflow - - # Allow the user to specify an encoding for their DB otherwise default - # to utf-8 so jobs & users with non-latin1 characters can still use - # us. 
- engine_args['encoding'] = conf.get('core', 'SQL_ENGINE_ENCODING', fallback='utf-8') - # For Python2 we get back a newstr and need a str - engine_args['encoding'] = engine_args['encoding'].__str__() - - engine = create_engine(SQL_ALCHEMY_CONN, **engine_args) - setup_event_handlers(engine) - - Session = scoped_session( - sessionmaker(autocommit=False, - autoflush=False, - bind=engine, - expire_on_commit=False)) + return engine_args def dispose_orm(): @@ -349,6 +381,36 @@ def prepare_syspath(): sys.path.append(PLUGINS_FOLDER) +def get_session_lifetime_config(): + """Gets session timeout configs and handles outdated configs gracefully.""" + session_lifetime_minutes = conf.get('webserver', 'session_lifetime_minutes', fallback=None) + session_lifetime_days = conf.get('webserver', 'session_lifetime_days', fallback=None) + uses_deprecated_lifetime_configs = session_lifetime_days or conf.get( + 'webserver', 'force_log_out_after', fallback=None + ) + + minutes_per_day = 24 * 60 + default_lifetime_minutes = '43200' + if uses_deprecated_lifetime_configs and session_lifetime_minutes == default_lifetime_minutes: + warnings.warn( + '`session_lifetime_days` option from `[webserver]` section has been ' + 'renamed to `session_lifetime_minutes`. The new option allows to configure ' + 'session lifetime in minutes. The `force_log_out_after` option has been removed ' + 'from `[webserver]` section. Please update your configuration.', + category=DeprecationWarning, + ) + if session_lifetime_days: + session_lifetime_minutes = minutes_per_day * int(session_lifetime_days) + + if not session_lifetime_minutes: + session_lifetime_days = 30 + session_lifetime_minutes = minutes_per_day * session_lifetime_days + + logging.debug('User session lifetime is set to %s minutes.', session_lifetime_minutes) + + return int(session_lifetime_minutes) + + def import_local_settings(): try: import airflow_local_settings @@ -399,3 +461,20 @@ def initialize(): # write rate. MIN_SERIALIZED_DAG_UPDATE_INTERVAL = conf.getint( 'core', 'min_serialized_dag_update_interval', fallback=30) + +# Fetching serialized DAG can not be faster than a minimum interval to reduce database +# read rate. This config controls when your DAGs are updated in the Webserver +MIN_SERIALIZED_DAG_FETCH_INTERVAL = conf.getint( + 'core', 'min_serialized_dag_fetch_interval', fallback=10) + +# Whether to persist DAG files code in DB. If set to True, Webserver reads file contents +# from DB instead of trying to access files in a DAG folder. +# Defaults to same as the store_serialized_dags setting. +STORE_DAG_CODE = conf.getboolean("core", "store_dag_code", fallback=STORE_SERIALIZED_DAGS) + +# If donot_modify_handlers=True, we do not modify logging handlers in task_run command +# If the flag is set to False, we remove all handlers from the root logger +# and add all handlers from 'airflow.task' logger to the root Logger. This is done +# to get all the logs from the print & log statements in the DAG files before a task is run +# The handlers are restored after the task completes execution. +DONOT_MODIFY_HANDLERS = conf.getboolean('logging', 'donot_modify_handlers', fallback=False) diff --git a/airflow/task/task_runner/standard_task_runner.py b/airflow/task/task_runner/standard_task_runner.py index 46d8e8dd0b284..8138cfa0daad0 100644 --- a/airflow/task/task_runner/standard_task_runner.py +++ b/airflow/task/task_runner/standard_task_runner.py @@ -17,6 +17,7 @@ # specific language governing permissions and limitations # under the License. 
+"""Standard task runner""" import os import psutil @@ -25,7 +26,7 @@ from airflow.task.task_runner.base_task_runner import BaseTaskRunner from airflow.utils.helpers import reap_process_group -CAN_FORK = hasattr(os, 'fork') +CAN_FORK = hasattr(os, "fork") class StandardTaskRunner(BaseTaskRunner): @@ -54,6 +55,7 @@ def _start_by_fork(self): return psutil.Process(pid) else: from airflow.bin.cli import get_parser + from airflow.sentry import Sentry import signal import airflow.settings as settings @@ -72,6 +74,9 @@ def _start_by_fork(self): # [1:] - remove "airflow" from the start of the command args = parser.parse_args(self._command[1:]) + self.log.info('Running: %s', self._command) + self.log.info('Job %s: Subtask %s', self._task_instance.job_id, self._task_instance.task_id) + proc_title = "airflow task runner: {0.dag_id} {0.task_id} {0.execution_date}" if hasattr(args, "job_id"): proc_title += " {0.job_id}" @@ -79,9 +84,13 @@ def _start_by_fork(self): try: args.func(args, dag=self.dag) - os._exit(0) + return_code = 0 except Exception: - os._exit(1) + return_code = 1 + finally: + # Explicitly flush any pending exception to Sentry if enabled + Sentry.flush() + os._exit(return_code) # pylint: disable=protected-access def return_code(self, timeout=0): # We call this multiple times, but we can only wait on the process once diff --git a/airflow/ti_deps/dep_context.py b/airflow/ti_deps/dep_context.py index c5d999ae33c17..74307e4b45d2c 100644 --- a/airflow/ti_deps/dep_context.py +++ b/airflow/ti_deps/dep_context.py @@ -66,6 +66,8 @@ class DepContext(object): :type ignore_task_deps: bool :param ignore_ti_state: Ignore the task instance's previous failure/success :type ignore_ti_state: bool + :param finished_tasks: A list of all the finished tasks of this run + :type finished_tasks: list[airflow.models.TaskInstance] """ def __init__( self, @@ -76,7 +78,8 @@ def __init__( ignore_in_retry_period=False, ignore_in_reschedule_period=False, ignore_task_deps=False, - ignore_ti_state=False): + ignore_ti_state=False, + finished_tasks=None): self.deps = deps or set() self.flag_upstream_failed = flag_upstream_failed self.ignore_all_deps = ignore_all_deps @@ -85,6 +88,28 @@ def __init__( self.ignore_in_reschedule_period = ignore_in_reschedule_period self.ignore_task_deps = ignore_task_deps self.ignore_ti_state = ignore_ti_state + self.finished_tasks = finished_tasks + + def ensure_finished_tasks(self, dag, execution_date, session): + """ + This method makes sure finished_tasks is populated if it's currently None. + This is for the strange feature of running tasks without dag_run. 
+ + :param dag: The DAG for which to find finished tasks + :type dag: airflow.models.DAG + :param execution_date: The execution_date to look for + :param session: Database session to use + :return: A list of all the finished tasks of this DAG and execution_date + :rtype: list[airflow.models.TaskInstance] + """ + if self.finished_tasks is None: + self.finished_tasks = dag.get_task_instances( + start_date=execution_date, + end_date=execution_date, + state=State.finished() + [State.UPSTREAM_FAILED], + session=session, + ) + return self.finished_tasks # In order to be able to get queued a task must have one of these states diff --git a/airflow/ti_deps/deps/dagrun_id_dep.py b/airflow/ti_deps/deps/dagrun_id_dep.py index 641fe84131858..56bba33c9d5c1 100644 --- a/airflow/ti_deps/deps/dagrun_id_dep.py +++ b/airflow/ti_deps/deps/dagrun_id_dep.py @@ -48,9 +48,9 @@ def _get_dep_statuses(self, ti, session, dep_context=None): from airflow.jobs import BackfillJob # To avoid a circular dependency dagrun = ti.get_dagrun(session) - if not dagrun.run_id or not match(BackfillJob.ID_PREFIX + '.*', dagrun.run_id): + if not dagrun or not dagrun.run_id or not match(BackfillJob.ID_PREFIX + '.*', dagrun.run_id): yield self._passing_status( - reason="Task's DagRun run_id is either NULL " + reason="Task's DagRun doesn't exist or the run_id is either NULL " "or doesn't start with {}".format(BackfillJob.ID_PREFIX)) else: yield self._failing_status( diff --git a/airflow/ti_deps/deps/not_previously_skipped_dep.py b/airflow/ti_deps/deps/not_previously_skipped_dep.py new file mode 100644 index 0000000000000..34ff6acff39c4 --- /dev/null +++ b/airflow/ti_deps/deps/not_previously_skipped_dep.py @@ -0,0 +1,88 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.ti_deps.deps.base_ti_dep import BaseTIDep + + +class NotPreviouslySkippedDep(BaseTIDep): + """ + Determines if any of the task's direct upstream relatives have decided this task should + be skipped. + """ + + NAME = "Not Previously Skipped" + IGNORABLE = True + IS_TASK_DEP = True + + def _get_dep_statuses( + self, ti, session, dep_context + ): # pylint: disable=signature-differs + from airflow.models.skipmixin import ( + SkipMixin, + XCOM_SKIPMIXIN_KEY, + XCOM_SKIPMIXIN_SKIPPED, + XCOM_SKIPMIXIN_FOLLOWED, + ) + from airflow.utils.state import State + + upstream = ti.task.get_direct_relatives(upstream=True) + + finished_tasks = dep_context.ensure_finished_tasks( + ti.task.dag, ti.execution_date, session + ) + + finished_task_ids = {t.task_id for t in finished_tasks} + + for parent in upstream: + if isinstance(parent, SkipMixin): + if parent.task_id not in finished_task_ids: + # This can happen if the parent task has not yet run. 
+ continue + + prev_result = ti.xcom_pull( + task_ids=parent.task_id, key=XCOM_SKIPMIXIN_KEY + ) + + if prev_result is None: + # This can happen if the parent task has not yet run. + continue + + should_skip = False + if ( + XCOM_SKIPMIXIN_FOLLOWED in prev_result + and ti.task_id not in prev_result[XCOM_SKIPMIXIN_FOLLOWED] + ): + # Skip any tasks that are not in "followed" + should_skip = True + elif ( + XCOM_SKIPMIXIN_SKIPPED in prev_result + and ti.task_id in prev_result[XCOM_SKIPMIXIN_SKIPPED] + ): + # Skip any tasks that are in "skipped" + should_skip = True + + if should_skip: + # If the parent SkipMixin has run, and the XCom result stored indicates this + # ti should be skipped, set ti.state to SKIPPED and fail the rule so that the + # ti does not execute. + ti.set_state(State.SKIPPED, session) + yield self._failing_status( + reason="Skipping because of previous XCom result from parent task {}" + .format(parent.task_id) + ) + return diff --git a/airflow/ti_deps/deps/trigger_rule_dep.py b/airflow/ti_deps/deps/trigger_rule_dep.py index c7ea6452ae67b..135e351dd7cdc 100644 --- a/airflow/ti_deps/deps/trigger_rule_dep.py +++ b/airflow/ti_deps/deps/trigger_rule_dep.py @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -from sqlalchemy import case, func +from collections import Counter import airflow from airflow.ti_deps.deps.base_ti_dep import BaseTIDep @@ -34,11 +34,32 @@ class TriggerRuleDep(BaseTIDep): IGNOREABLE = True IS_TASK_DEP = True + @staticmethod + @provide_session + def _get_states_count_upstream_ti(ti, finished_tasks, session): + """ + This function returns the states of the upstream tis for a specific ti in order to determine + whether this ti can run in this iteration + + :param ti: the ti that we want to calculate deps for + :type ti: airflow.models.TaskInstance + :param finished_tasks: all the finished tasks of the dag_run + :type finished_tasks: list[airflow.models.TaskInstance] + """ + if finished_tasks is None: + # this is for the strange feature of running tasks without dag_run + finished_tasks = ti.task.dag.get_task_instances( + start_date=ti.execution_date, + end_date=ti.execution_date, + state=State.finished() + [State.UPSTREAM_FAILED], + session=session) + counter = Counter(task.state for task in finished_tasks if task.task_id in ti.task.upstream_task_ids) + return counter.get(State.SUCCESS, 0), counter.get(State.SKIPPED, 0), counter.get(State.FAILED, 0), \ + counter.get(State.UPSTREAM_FAILED, 0), sum(counter.values()) + @provide_session def _get_dep_statuses(self, ti, session, dep_context): - TI = airflow.models.TaskInstance TR = airflow.utils.trigger_rule.TriggerRule - # Checking that all upstream dependencies have succeeded if not ti.task.upstream_list: yield self._passing_status( @@ -48,34 +69,11 @@ def _get_dep_statuses(self, ti, session, dep_context): if ti.task.trigger_rule == TR.DUMMY: yield self._passing_status(reason="The task had a dummy trigger rule set.") return + # see if the task name is in the task upstream for our task + successes, skipped, failed, upstream_failed, done = self._get_states_count_upstream_ti( + ti=ti, + finished_tasks=dep_context.finished_tasks) - # TODO(unknown): this query becomes quite expensive with dags that have many - # tasks. It should be refactored to let the task report to the dag run and get the - # aggregates from there. 
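# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the patch): the in-Python
# aggregation that replaces the removed SQL query below. Only finished
# upstream task instances are counted, and the state values are plain strings.

from collections import Counter

upstream_states = ["success", "success", "skipped", "failed"]   # example data
counter = Counter(upstream_states)

successes = counter.get("success", 0)                 # 2
skipped = counter.get("skipped", 0)                   # 1
failed = counter.get("failed", 0)                     # 1
upstream_failed = counter.get("upstream_failed", 0)   # 0
done = sum(counter.values())                          # 4
# These five numbers feed _evaluate_trigger_rule() exactly as the removed
# query's aggregates did.
# ---------------------------------------------------------------------------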
- qry = ( - session - .query( - func.coalesce(func.sum( - case([(TI.state == State.SUCCESS, 1)], else_=0)), 0), - func.coalesce(func.sum( - case([(TI.state == State.SKIPPED, 1)], else_=0)), 0), - func.coalesce(func.sum( - case([(TI.state == State.FAILED, 1)], else_=0)), 0), - func.coalesce(func.sum( - case([(TI.state == State.UPSTREAM_FAILED, 1)], else_=0)), 0), - func.count(TI.task_id), - ) - .filter( - TI.dag_id == ti.dag_id, - TI.task_id.in_(ti.task.upstream_task_ids), - TI.execution_date == ti.execution_date, - TI.state.in_([ - State.SUCCESS, State.FAILED, - State.UPSTREAM_FAILED, State.SKIPPED]), - ) - ) - - successes, skipped, failed, upstream_failed, done = qry.first() for dep_status in self._evaluate_trigger_rule( ti=ti, successes=successes, diff --git a/airflow/typing_compat.py b/airflow/typing_compat.py new file mode 100644 index 0000000000000..f4ed7bd95a822 --- /dev/null +++ b/airflow/typing_compat.py @@ -0,0 +1,30 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +This module provides helper code to make type annotation within Airflow +codebase easier. +""" + +try: + # Protocol is only added to typing module starting from python 3.8 + # we can safely remove this shim import after Airflow drops support for + # <3.8 + from typing import Protocol # noqa # pylint: disable=unused-import +except ImportError: + from typing_extensions import Protocol # type: ignore # noqa diff --git a/airflow/upgrade/README.md b/airflow/upgrade/README.md new file mode 100644 index 0000000000000..67679db33326f --- /dev/null +++ b/airflow/upgrade/README.md @@ -0,0 +1,132 @@ + + +# Apache Airflow Upgrade Check + +[![PyPI version](https://badge.fury.io/py/apache-airflow-upgrade-check.svg)](https://badge.fury.io/py/apache-airflow-upgrade-check) +[![License](http://img.shields.io/:license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.txt) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/apache-airflow-upgrade-check.svg)](https://pypi.org/project/apache-airflow-upgrade-check/) +[![PyPI - Downloads](https://img.shields.io/pypi/dm/apache-airflow-upgrade-check)](https://pypi.org/project/apache-airflow-upgrade-check/) +[![Twitter Follow](https://img.shields.io/twitter/follow/ApacheAirflow.svg?style=social&label=Follow)](https://twitter.com/ApacheAirflow) +[![Slack Status](https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&style=social)](https://s.apache.org/airflow-slack) + +This package aims to easy the upgrade journey from [Apache Airflow](https://airflow.apache.org/) 1.10 to 2.0. 
+ +While we have put a lot of effort in to making this upgrade as painless as possible, with many changes +providing upgrade path (where the old code continues to work and prints out a deprecation warning) there were +unfortunately some breaking changes where we couldn't provide a compatibility shim. + +The recommended upgrade path to get to Airflow 2.0.0 is to first upgrade to the latest release in the 1.10 +series (at the time of writing: 1.10.15) and to then run this script. + +```bash +pip install apache-airflow-upgrade-check +airflow upgrade_check +``` + +This will then print out a number of action items that you should follow before upgrading to 2.0.0 or above. + +The exit code of the command will be 0 (success) if no problems are reported, or 1 otherwise. + +For example: + +``` +============================================= STATUS ============================================= + +Check for latest versions of apache-airflow and checker.................................SUCCESS +Legacy UI is deprecated by default......................................................SUCCESS +Users must set a kubernetes.pod_template_file value.....................................FAIL +Changes in import paths of hooks, operators, sensors and others.........................FAIL +Remove airflow.AirflowMacroPlugin class.................................................SUCCESS +Check versions of PostgreSQL, MySQL, and SQLite to ease upgrade to Airflow 2.0..........SUCCESS +Fernet is enabled by default............................................................FAIL +Logging configuration has been moved to new section.....................................SUCCESS +Connection.conn_id is not unique........................................................SUCCESS +GCP service account key deprecation.....................................................SUCCESS +Users must delete deprecated configs for KubernetesExecutor.............................FAIL +Changes in import path of remote task handlers..........................................SUCCESS +Chain between DAG and operator not allowed..............................................SUCCESS +SendGrid email uses old airflow.contrib module..........................................SUCCESS +Connection.conn_type is not nullable....................................................SUCCESS +Found 16 problems. + +======================================== RECOMMENDATIONS ========================================= + +Users must set a kubernetes.pod_template_file value +--------------------------------------------------- +In Airflow 2.0, KubernetesExecutor Users need to set a pod_template_file as a base +value for all pods launched by the KubernetesExecutor + + +Problems: + + 1. Please create a pod_template_file by running `airflow generate_pod_template`. +This will generate a pod using your aiflow.cfg settings + +... 
+``` + +Additionally you can use "upgrade config" to: +- specify rules you would like to ignore +- extend the check using custom rules + +For example: + +```bash +airflow upgrade_check --config=/files/upgrade.yaml +``` + +the configuration file should be a proper yaml file similar to this one: + +```yaml +ignored_rules: + - LegacyUIDeprecated + - ConnTypeIsNotNullableRule + - PodTemplateFileRule + +custom_rules: + - path.to.upgrade_module.VeryCustomCheckClass + - path.to.upgrade_module.VeryCustomCheckClass2 +``` + +## Changelog + +### 1.3.0 + +- Fix wrong warning about class that was not used in a dag file (#14700) +- Fill DagBag from `dag_folder` setting for upgrade rules (#14588) +- Bugfix: False positives for Custom Executors via Plugins check (#14680) +- Bugfix: Fix False alarm in import changes rule (#14493) +- Use `CustomSQLAInterface` instead of `SQLAInterface` (#14475) +- Fix comparing airflow version to work with older versions of packaging library (#14435) +- Fix Incorrect warning in upgrade check and error in reading file (#14344) +- Handle possible suffix in MySQL version + avoid hard-coding (#14274) + +### 1.2.0 + +- Add upgrade check option to list checks (#13392) +- Add clearer exception for read failures in macro plugin upgrade (#13371) +- Treat default value in ``HostnameCallable`` rule as good one (#13670) +- Created ``CustomExecutorsRequireFullPathRule`` class (#13678) +- Remove ``UndefinedJinjaVariableRule`` +- Created rule for ``SparkJDBCOperator`` class ``conn_id`` (#13798) +- Created ``DatabaseVersionCheckRule`` class (#13955) +- Add Version command for Upgrade Check (#12929) +- Use Tabular Format for the List of Upgrade Check Rules (#14139) +- Fix broken ``airflow upgrade_check`` command (#14137) diff --git a/airflow/upgrade/__init__.py b/airflow/upgrade/__init__.py new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/airflow/upgrade/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/airflow/upgrade/checker.py b/airflow/upgrade/checker.py new file mode 100644 index 0000000000000..cd81f45f4c69c --- /dev/null +++ b/airflow/upgrade/checker.py @@ -0,0 +1,135 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import +import argparse +import logging +import sys +from tabulate import tabulate +from typing import List + +from airflow.upgrade.formatters import BaseFormatter +from airflow.upgrade.problem import RuleStatus +from airflow.upgrade.rules import get_rules +from airflow.upgrade.rules.base_rule import BaseRule + +ALL_RULES = [cls() for cls in get_rules()] # type: List[BaseRule] + + +def check_upgrade(formatter, rules): + # type: (BaseFormatter, List[BaseRule]) -> List[RuleStatus] + formatter.start_checking(rules) + all_rule_statuses = [] # List[RuleStatus] + for rule in rules: + rule_status = RuleStatus.from_rule(rule) + all_rule_statuses.append(rule_status) + formatter.on_next_rule_status(rule_status) + formatter.end_checking(all_rule_statuses) + return all_rule_statuses + + +def list_checks(): + print() + rules = ( + (rule.__class__.__name__, rule.title) + for rule in ALL_RULES + ) + print(tabulate(rules, headers=["Rule Name", "Description"])) + print() + + +def register_arguments(subparser): + subparser.add_argument( + "-s", "--save", + help="Saves the result to the indicated file. The file format is determined by the file extension." + ) + subparser.add_argument( + "-i", "--ignore", + help="Ignore a rule. Can be used multiple times.", + action="append", + ) + subparser.add_argument( + "-c", "--config", + help="Path to upgrade check config yaml file.", + ) + subparser.add_argument( + "-l", "--list", + help="List the upgrade checks and their class names", + action="store_true", + ) + subparser.add_argument( + "-V", "--version", + help="Show the version of upgrade_check", + action="store_true", + ) + subparser.set_defaults(func=run) + + +def run(args): + from airflow.upgrade.formatters import ConsoleFormatter, JSONFormatter + from airflow.upgrade.config import UpgradeConfig + + if args.version: + from airflow.upgrade.version import version + print(version) + return + + if args.list: + list_checks() + return + + if args.save: + filename = args.save + if not filename.lower().endswith(".json"): + exit("Only JSON files are supported") + formatter = JSONFormatter(args.save) + else: + formatter = ConsoleFormatter() + + rules = ALL_RULES + ignored_rules = args.ignore or [] + + if args.config: + print("Using config file:", args.config) + upgrade_config = UpgradeConfig.read(path=args.config) + rules.extend(upgrade_config.get_custom_rules()) + ignored_rules.extend(upgrade_config.get_ignored_rules()) + + rules = [r for r in rules if r.__class__.__name__ not in ignored_rules] + + # Disable ERROR and below logs to avoid them in console output. 
+ # We want to show only output of upgrade_check command + logging.disable(logging.ERROR) + + all_rule_statuses = check_upgrade(formatter, rules) + any_problems = any(rule.is_problem for rule in all_rule_statuses) + if any_problems: + sys.exit(1) + + +def __main__(): + parser = argparse.ArgumentParser() + register_arguments(parser) + args = parser.parse_args() + if args.list: + list_checks() + else: + run(args) + + +if __name__ == "__main__": + __main__() diff --git a/airflow/upgrade/config.py b/airflow/upgrade/config.py new file mode 100644 index 0000000000000..54c1352cc64a0 --- /dev/null +++ b/airflow/upgrade/config.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import +import yaml +from jsonschema import validate + +from airflow.utils.module_loading import import_string + +SCHEMA = { + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "ignored_rules": {"type": ["array", "null"], "items": {"type": "string"}}, + "custom_rules": {"type": ["array", "null"], "items": {"type": "string"}}, + }, + "additionalProperties": False, +} + + +class UpgradeConfig(object): + def __init__(self, raw_config): + self._raw_config = raw_config + + def get_ignored_rules(self): + return self._raw_config.get("ignored_rules") or [] + + def get_custom_rules(self): + custom_rules = self._raw_config.get("custom_rules") or [] + return [import_string(r)() for r in custom_rules] + + @classmethod + def read(cls, path): + with open(path) as f: + raw_config = yaml.safe_load(f) + validate(raw_config, schema=SCHEMA) + return cls(raw_config) diff --git a/airflow/upgrade/formatters.py b/airflow/upgrade/formatters.py new file mode 100644 index 0000000000000..4b2c5c362a77f --- /dev/null +++ b/airflow/upgrade/formatters.py @@ -0,0 +1,137 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
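# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the patch): a hypothetical custom
# rule that UpgradeConfig.get_custom_rules() above could load when the upgrade
# config lists "upgrade_module.DefaultOwnerRule" under custom_rules. BaseRule
# is the real base class; the policy checked here is only an example.

from airflow.upgrade.rules.base_rule import BaseRule


class DefaultOwnerRule(BaseRule):
    title = "DAGs should not use the default owner"
    description = "Example company-specific policy checked before upgrading to 2.0."

    def check(self):
        from airflow.models import DagBag

        messages = []
        for dag_id, dag in DagBag().dags.items():
            if dag.owner == "airflow":
                messages.append("DAG {} still uses the default owner".format(dag_id))
        # An empty list (or None) is reported as SUCCESS; each returned string
        # becomes one problem message under RECOMMENDATIONS.
        return messages

# Such a rule can be skipped again at run time with:
#     airflow upgrade_check --ignore DefaultOwnerRule
# ---------------------------------------------------------------------------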
+ +from abc import ABCMeta +from typing import List +import json + +import pygments +from pygments.console import colorize +from pygments.formatters.terminal import TerminalFormatter +from pygments.lexers.markup import RstLexer + +from airflow.upgrade.problem import RuleStatus +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.utils.cli import header, get_terminal_size + + +class BaseFormatter(object): + __metaclass__ = ABCMeta + + def start_checking(self, all_rules): + # type: (List[BaseRule]) -> None + pass + + def end_checking(self, rule_statuses): + # type: (List[RuleStatus]) -> None + + pass + + def on_next_rule_status(self, rule_status): + # type: (RuleStatus) -> None + pass + + +class ConsoleFormatter(BaseFormatter): + def start_checking(self, all_rules): + print() + header("STATUS", "=") + print() + + def end_checking(self, rule_statuses): + messages_count = sum( + len(rule_status.messages) + for rule_status in rule_statuses + ) + if messages_count == 1: + print("Found {} problem.".format(messages_count)) + else: + print("Found {} problems.".format(messages_count)) + print() + + if messages_count > 0: + header("RECOMMENDATIONS", "=") + print() + self.display_recommendations(rule_statuses) + else: + print("Not found any problems. World is beautiful. ") + print("You can safely update Airflow to the new version.") + + @staticmethod + def display_recommendations(rule_statuses): + for rule_status in rule_statuses: + # Show recommendations only if there are any messaged + if not rule_status.messages: + continue + + rule = rule_status.rule + lines = [rule.title, "-" * len(rule.title)] + if rule_status.skipped: + lines.extend([rule_status.messages[0]]) + else: + if rule.description: + lines.extend([rule.description]) + lines.extend([ + "", + "Problems:", + "", + ]) + lines.extend(['{:>3}. {}'.format(i, m) for i, m in enumerate(rule_status.messages, 1)]) + msg = "\n".join(lines) + + formatted_msg = pygments.highlight( + code=msg, formatter=TerminalFormatter(), lexer=RstLexer() + ) + print(formatted_msg) + + def on_next_rule_status(self, rule_status): + if rule_status.skipped: + status = colorize("yellow", "SKIPPED") + elif rule_status.is_success: + status = colorize("green", "SUCCESS") + else: + status = colorize("red", "FAIL") + status_line_fmt = self.prepare_status_line_format() + print(status_line_fmt.format(rule_status.rule.title, status)) + + @staticmethod + def prepare_status_line_format(): + _, terminal_width = get_terminal_size() + + return "{:.<" + str(terminal_width - 10) + "}{:.>10}" + + +class JSONFormatter(BaseFormatter): + def __init__(self, output_path): + self.filename = output_path + + def start_checking(self, all_rules): + print("Start looking for problems.") + + @staticmethod + def _info_from_rule_status(rule_status): + return { + "rule": type(rule_status.rule).__name__, + "title": rule_status.rule.title, + "messages": rule_status.messages, + } + + def end_checking(self, rule_statuses): + formatted_results = [self._info_from_rule_status(rs) for rs in rule_statuses] + with open(self.filename, "w+") as output_file: + json.dump(formatted_results, output_file, indent=2) + print("Saved result to: {}".format(self.filename)) diff --git a/airflow/upgrade/problem.py b/airflow/upgrade/problem.py new file mode 100644 index 0000000000000..96787b1d1a34b --- /dev/null +++ b/airflow/upgrade/problem.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import +from typing import NamedTuple, List, Iterable + +from airflow.upgrade.rules.base_rule import BaseRule + + +class RuleStatus(NamedTuple( + 'RuleStatus', + [ + ('rule', BaseRule), + ('messages', List[str]), + ('skipped', bool) + ] +)): + @property + def is_success(self): + return len(self.messages) == 0 + + @property + def is_problem(self): + return not self.skipped and not self.is_success + + @classmethod + def from_rule(cls, rule): + # type: (BaseRule) -> RuleStatus + msg = rule.should_skip() + if msg: + return cls(rule=rule, messages=[msg], skipped=True) + + messages = [] # type: List[str] + result = rule.check() + if isinstance(result, str): + messages = [result] + elif isinstance(result, Iterable): + messages = list(result) + return cls(rule=rule, messages=messages, skipped=False) diff --git a/airflow/upgrade/rules/__init__.py b/airflow/upgrade/rules/__init__.py new file mode 100644 index 0000000000000..c883ca217f2f4 --- /dev/null +++ b/airflow/upgrade/rules/__init__.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
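A short sketch of how `RuleStatus.from_rule` above classifies a rule's outcome; the `AlwaysBroken` rule is invented purely for illustration and is not one of the shipped rules.

```python
from airflow.upgrade.problem import RuleStatus
from airflow.upgrade.rules.base_rule import BaseRule


class AlwaysBroken(BaseRule):
    # Hypothetical rule used only to demonstrate the classification logic.
    title = "Always broken"
    description = "Always reports a single problem."

    def check(self):
        # A rule may return a single string, an iterable of strings, or nothing.
        return "something is wrong"


status = RuleStatus.from_rule(AlwaysBroken())
assert status.messages == ["something is wrong"]
assert status.is_problem and not status.skipped and not status.is_success
```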
+import os + + +def get_rules(): + """Automatically discover all rules""" + rule_classes = [] + path = os.path.dirname(os.path.abspath(__file__)) + for file in sorted(os.listdir(path)): + if not file.endswith(".py") or file in ("__init__.py", "base_rule.py"): + continue + py_file = file[:-3] + mod = __import__(".".join([__name__, py_file]), fromlist=[py_file]) + classes = [getattr(mod, x) for x in dir(mod) if isinstance(getattr(mod, x), type)] + for cls in classes: + bases = [b.__name__ for b in cls.__bases__] + if cls.__name__ != "BaseRule" and "BaseRule" in bases: + rule_classes.append(cls) + # Sort rules alphabetically by class name, while maintaining that the airflow version + # check should remain first + return rule_classes[:1] + sorted(rule_classes[1:], key=lambda r: r.__name__) diff --git a/airflow/upgrade/rules/aaa_airflow_version_check.py b/airflow/upgrade/rules/aaa_airflow_version_check.py new file mode 100644 index 0000000000000..ad84eb4602359 --- /dev/null +++ b/airflow/upgrade/rules/aaa_airflow_version_check.py @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This module starts with `aaa_` so that it is sorted first alphabetically, but is still a valid python module +# name (starting with digitis is not valid) + +from __future__ import absolute_import + +from packaging.version import Version +import requests + +from airflow.upgrade.rules.base_rule import BaseRule + +try: + import importlib.metadata as importlib_metadata +except ImportError: + import importlib_metadata + + +class VersionCheckRule(BaseRule): + + title = "Check for latest versions of apache-airflow and checker" + + description = """\ +Check that the latest version of apache-airflow-upgrade-check is installed, and +that you are on the latest 1.10.x release of apache-airflow.""" + + def pypi_releases(self, distname): + """ + Get all the non-dev releases of a dist from PyPI + """ + + resp = requests.get("https://pypi.org/pypi/{}/json".format(distname)) + resp.raise_for_status() + + for rel_string in resp.json()["releases"].keys(): + ver = Version(rel_string) + if ver.is_devrelease or ver.is_prerelease: + continue + yield ver + + def check(self): + + current_airflow_version = Version(__import__("airflow").__version__) + try: + upgrade_check_ver = Version( + importlib_metadata.distribution("apache-airflow-upgrade-check").version, + ) + except importlib_metadata.PackageNotFoundError: + upgrade_check_ver = Version("0.0.0") + + try: + latest_airflow_v1_release = sorted( + filter(lambda v: v.major == 1, self.pypi_releases("apache-airflow")) + )[-1] + + if current_airflow_version < latest_airflow_v1_release: + yield ( + "There is a more recent version of apache-airflow. 
Please upgrade to {} and re-run this" + " script" + ).format(latest_airflow_v1_release) + + latest_upgrade_check_release = sorted( + self.pypi_releases("apache-airflow-upgrade-check") + )[-1] + + if upgrade_check_ver < latest_upgrade_check_release: + yield ( + "There is a more recent version of apache-airflow-upgrade-check. Please upgrade to {}" + " and re-run this script" + ).format(latest_upgrade_check_release) + except Exception as e: + yield "Unable to go ask PyPI.org for latest release information: " + str(e) diff --git a/airflow/upgrade/rules/airflow_macro_plugin_removed.py b/airflow/upgrade/rules/airflow_macro_plugin_removed.py new file mode 100644 index 0000000000000..bb2da42285039 --- /dev/null +++ b/airflow/upgrade/rules/airflow_macro_plugin_removed.py @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from airflow import conf +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.utils.dag_processing import list_py_file_paths + + +class AirflowMacroPluginRemovedRule(BaseRule): + + title = "Remove airflow.AirflowMacroPlugin class" + + description = "The airflow.AirflowMacroPlugin class has been removed." + + MACRO_PLUGIN_CLASS = "airflow.AirflowMacroPlugin" + + def _change_info(self, file_path, line_number): + return "{} will be removed. Affected file: {} (line {})".format( + self.MACRO_PLUGIN_CLASS, file_path, line_number + ) + + def _check_file(self, file_path): + problems = [] + class_name_to_check = self.MACRO_PLUGIN_CLASS.split(".")[-1] + with open(file_path, "r") as file_pointer: + try: + for line_number, line in enumerate(file_pointer, 1): + if class_name_to_check in line: + problems.append(self._change_info(file_path, line_number)) + except UnicodeDecodeError: + problems.append("Unable to read python file {}".format(file_path)) + return problems + + def check(self): + dag_folder = conf.get("core", "dags_folder") + file_paths = list_py_file_paths(directory=dag_folder, include_examples=False) + problems = [] + for file_path in file_paths: + if not file_path.endswith(".py"): + continue + problems.extend(self._check_file(file_path)) + return problems diff --git a/airflow/upgrade/rules/base_rule.py b/airflow/upgrade/rules/base_rule.py new file mode 100644 index 0000000000000..8cf0e0f3d5210 --- /dev/null +++ b/airflow/upgrade/rules/base_rule.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from abc import abstractmethod + + +class BaseRule(object): + @property + @abstractmethod + def title(self): + # type: () -> str + """Short one-line summary""" + pass + + @property + @abstractmethod + def description(self): + # type: () -> str + """A long description explaining the problem in detail. This can be an entry from UPDATING.md file.""" + pass + + def should_skip(self): + """ + Executes a pre check of configuration. If returned value is + True then the checking the rule is omitted. + """ + pass + + def check(self): + pass diff --git a/airflow/upgrade/rules/chain_between_dag_and_operator_not_allowed_rule.py b/airflow/upgrade/rules/chain_between_dag_and_operator_not_allowed_rule.py new file mode 100644 index 0000000000000..3291c0355a53e --- /dev/null +++ b/airflow/upgrade/rules/chain_between_dag_and_operator_not_allowed_rule.py @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import absolute_import + +import re +import os +from airflow import conf +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.utils.dag_processing import list_py_file_paths + + +class ChainBetweenDAGAndOperatorNotAllowedRule(BaseRule): + + title = "Chain between DAG and operator not allowed." + + description = "Assigning task to a DAG using bitwise shift (bit-shift) operators are no longer supported." + + def _change_info(self, file_path, line_number): + return "{} Affected file: {} (line {})".format( + self.title, file_path, line_number + ) + + def _check_file(self, file_path): + problems = [] + with open(file_path, "r") as file_pointer: + try: + lines = file_pointer.readlines() + + python_space = r"\s*\\?\s*\n?\s*" + # Find all the dag variable names. + dag_vars = re.findall(r"([A-Za-z0-9_]+){}={}DAG\(".format(python_space, python_space), + "".join(lines)) + history = "" + for line_number, line in enumerate(lines, 1): + # Someone could have put the bitshift operator on a different line than the dag they + # were using it on, so search for dag >> or << dag in all previous lines that did + # not contain a logged issue. 
+ history += line + matches = [ + re.search(r"DAG\([^\)]+\){}>>".format(python_space), history), + re.search(r"<<{}DAG\(".format(python_space), history) + ] + for dag_var in dag_vars: + matches.extend([ + re.search(r"(\s|^){}{}>>".format(dag_var, python_space), history), + re.search(r"<<\s*{}{}".format(python_space, dag_var), history), + ]) + if any(matches): + problems.append(self._change_info(file_path, line_number)) + # If we found a problem, clear our history so we don't re-log the problem + # on the next line. + history = "" + except UnicodeDecodeError: + problems.append("Unable to read python file {}".format(file_path)) + return problems + + def check(self): + dag_folder = conf.get("core", "dags_folder") + file_paths = list_py_file_paths(directory=dag_folder, include_examples=False) + file_paths = [file for file in file_paths if os.path.splitext(file)[1] == ".py"] + problems = [] + for file_path in file_paths: + problems.extend(self._check_file(file_path)) + return problems diff --git a/airflow/upgrade/rules/conn_id_is_unique.py b/airflow/upgrade/rules/conn_id_is_unique.py new file mode 100644 index 0000000000000..edb53c844b741 --- /dev/null +++ b/airflow/upgrade/rules/conn_id_is_unique.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from sqlalchemy import func +from airflow.models import Connection +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.utils.db import provide_session + + +class UniqueConnIdRule(BaseRule): + title = "Connection.conn_id is not unique" + + description = """\ +The `id` column in the `connection` table must be unique. Previously, this rule was \ +enforced by application logic, but was not enforced by the database schema. + +If you made any modifications to the table directly, make sure you don't have \ +duplicate values in conn_id column. + """ + + @provide_session + def check(self, session=None): + invalid_connections = session.query(Connection.conn_id)\ + .group_by(Connection.conn_id)\ + .having(func.count() > 1) + return ( + 'Connection.conn_id={} is not unique.'.format(conn_id) + for conn_id in invalid_connections.all() + ) diff --git a/airflow/upgrade/rules/conn_type_is_not_nullable.py b/airflow/upgrade/rules/conn_type_is_not_nullable.py new file mode 100644 index 0000000000000..a411eb0028c30 --- /dev/null +++ b/airflow/upgrade/rules/conn_type_is_not_nullable.py @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import
+
+from airflow.models import Connection
+from airflow.upgrade.rules.base_rule import BaseRule
+from airflow.utils.db import provide_session
+
+
+class ConnTypeIsNotNullableRule(BaseRule):
+
+    title = "Connection.conn_type is not nullable"
+
+    description = """\
+The `conn_type` column in the `connection` table must contain content. Previously, this rule was \
+enforced by application logic, but was not enforced by the database schema.
+
+If you made any modifications to the table directly, make sure you don't have null in the conn_type column.\
+"""
+
+    @provide_session
+    def check(self, session=None):
+        invalid_connections = session.query(Connection).filter(
+            Connection.conn_type.is_(None)
+        )
+        return (
+            'Connection<id={}, conn_id={}> has an empty conn_type field.'.format(
+                conn.id, conn.conn_id
+            )
+            for conn in invalid_connections.all()
+        )
diff --git a/airflow/upgrade/rules/custom_executors_require_full_path_rule.py b/airflow/upgrade/rules/custom_executors_require_full_path_rule.py
new file mode 100644
index 0000000000000..55e9d5d4f9a34
--- /dev/null
+++ b/airflow/upgrade/rules/custom_executors_require_full_path_rule.py
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from airflow.upgrade.rules.base_rule import BaseRule
+
+
+class CustomExecutorsRequireFullPathRule(BaseRule):
+    """
+    CustomExecutorsRequireFullPathRule class to ease upgrade to Airflow 2.0
+    """
+    title = "Custom Executors now require full path"
+    description = """\
+In Airflow 2.0, loading custom executors via plugins is no longer required.
+To load a custom executor, you have to provide the full path to the custom executor module.
+    """
+
+    def check(self):
+        from airflow.plugins_manager import plugins
+        executors_via_plugins = []
+
+        for plugin in plugins:
+            if plugin.executors:
+                for executor in plugin.executors:
+                    executors_via_plugins.append(executor.__name__)
+
+        if executors_via_plugins:
+            return (
+                "Deprecation Warning: Found Custom Executor imported via a plugin. "
+                "From Airflow 2.0, you should use regular Python modules to import Custom Executors. "
+                "You should provide the full path to the custom executor module. "
+ "See the link below for more details:" + "https://github.com/apache/airflow/blob/2.0.0/" + "UPDATING.md#custom-executors-is-loaded-using-full-import-path \n" + "Following Executors were imported using Plugins: \n" + "{}".format(executors_via_plugins) + ) diff --git a/airflow/upgrade/rules/custom_operator_metaclass_rule.py b/airflow/upgrade/rules/custom_operator_metaclass_rule.py new file mode 100644 index 0000000000000..6afcf360d0acf --- /dev/null +++ b/airflow/upgrade/rules/custom_operator_metaclass_rule.py @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from airflow.configuration import conf +from airflow.models.dagbag import DagBag +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.utils.db import provide_session + + +def check_task_for_metaclasses(task): + class_type = type(task.__class__) + if class_type != type: + res = ( + "Class {class_name} contained invalid custom metaclass " + "{metaclass_name}. Custom metaclasses for operators are not " + "allowed in Airflow 2.0. Please remove this custom metaclass.".format( + class_name=task.__class__, metaclass_name=class_type + ) + ) + return res + else: + return None + + +class BaseOperatorMetaclassRule(BaseRule): + title = "Ensure users are not using custom metaclasses in custom operators" + + description = """\ +In Airflow 2.0, we require that all custom operators use the BaseOperatorMeta metaclass.\ +To ensure this, we can no longer allow custom metaclasses in custom operators. + """ + + @provide_session + def check(self, session=None): + dagbag = DagBag(dag_folder=conf.get("core", "dags_folder"), include_examples=False) + for dag_id, dag in dagbag.dags.items(): + for task in dag.tasks: + res = check_task_for_metaclasses(task) + if res: + yield res diff --git a/airflow/upgrade/rules/db_api_functions.py b/airflow/upgrade/rules/db_api_functions.py new file mode 100644 index 0000000000000..0ca730fd287f7 --- /dev/null +++ b/airflow/upgrade/rules/db_api_functions.py @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.hooks.base_hook import BaseHook +from airflow.upgrade.rules.base_rule import BaseRule + + +def check_get_pandas_df(cls): + try: + cls.__new__(cls).get_pandas_df("fake SQL") + return return_error_string(cls, "get_pandas_df") + except NotImplementedError: + pass + except Exception: + return return_error_string(cls, "get_pandas_df") + + +def check_run(cls): + try: + cls.__new__(cls).run("fake SQL") + return return_error_string(cls, "run") + except Exception: + pass + + +def check_get_records(cls): + try: + cls.__new__(cls).get_records("fake SQL") + return return_error_string(cls, "get_records") + except Exception: + pass + + +def return_error_string(cls, method): + return ( + "Class {} incorrectly implements the function {} while inheriting from BaseHook. " + "Please make this class inherit from airflow.hooks.db_api_hook.DbApiHook instead".format( + cls, method + ) + ) + + +def get_all_non_dbapi_children(): + basehook_children = [ + child for child in BaseHook.__subclasses__() if child.__name__ != "DbApiHook" + ] + res = basehook_children[:] + while basehook_children: + next_generation = [] + for child in basehook_children: + subclasses = child.__subclasses__() + if subclasses: + next_generation.extend(subclasses) + res.extend(next_generation) + basehook_children = next_generation + return res + + +class DbApiRule(BaseRule): + title = "Hooks that run DB functions must inherit from DBApiHook" + + description = ( + "Hooks that run DB functions must inherit from DBApiHook instead of BaseHook" + ) + + def check(self): + basehook_subclasses = get_all_non_dbapi_children() + incorrect_implementations = [] + for child in basehook_subclasses: + pandas_df = check_get_pandas_df(child) + if pandas_df: + incorrect_implementations.append(pandas_df) + run = check_run(child) + if run: + incorrect_implementations.append(run) + get_records = check_get_records(child) + if get_records: + incorrect_implementations.append(get_records) + return incorrect_implementations diff --git a/airflow/upgrade/rules/fernet_enabled.py b/airflow/upgrade/rules/fernet_enabled.py new file mode 100644 index 0000000000000..b5f99470e3104 --- /dev/null +++ b/airflow/upgrade/rules/fernet_enabled.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from airflow.configuration import conf +from airflow.upgrade.rules.base_rule import BaseRule + + +class FernetEnabledRule(BaseRule): + title = "Fernet is enabled by default" + + description = ( + "The fernet mechanism is enabled by default " + "to increase the security of the default installation." 
+ ) + + def check(self): + fernet_key = conf.get("core", "fernet_key") + if not fernet_key: + return ( + "fernet_key in airflow.cfg must be explicitly set empty as fernet mechanism is enabled" + "by default. This means that the apache-airflow[crypto] extra-packages are always installed." + "However, this requires that your operating system has libffi-dev installed." + ) diff --git a/airflow/upgrade/rules/gcp_service_account_keys_rule.py b/airflow/upgrade/rules/gcp_service_account_keys_rule.py new file mode 100644 index 0000000000000..be462f18e0b75 --- /dev/null +++ b/airflow/upgrade/rules/gcp_service_account_keys_rule.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from airflow.configuration import conf +from airflow.upgrade.rules.base_rule import BaseRule + + +class GCPServiceAccountKeyRule(BaseRule): + title = "GCP service account key deprecation" + + description = """Option has been removed because it is no longer \ +supported by the Google Kubernetes Engine.""" + + def check(self): + gcp_option = conf.get(section="kubernetes", key="gcp_service_account_keys") + if gcp_option: + msg = """This option has been removed because it is no longer \ +supported by the Google Kubernetes Engine. The new recommended \ +service account keys for the Google Cloud management method is \ +Workload Identity.""" + return [msg] + else: + return None diff --git a/airflow/upgrade/rules/hostname_callable_rule.py b/airflow/upgrade/rules/hostname_callable_rule.py new file mode 100644 index 0000000000000..a316e526233d4 --- /dev/null +++ b/airflow/upgrade/rules/hostname_callable_rule.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
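The configuration-driven rules above share one shape: read a value from `airflow.cfg` via `conf` and return a message from `check()` when something needs attention. A user-defined rule registered through `custom_rules` in the upgrade config could follow the same shape; the section/option checked below is only an assumption for illustration.

```python
from airflow.configuration import conf

from airflow.upgrade.rules.base_rule import BaseRule


class ParallelismIsPositiveRule(BaseRule):
    # Hypothetical user-defined rule, shown only to illustrate the check() contract.
    title = "core.parallelism should be a positive integer"
    description = "check() may return a string, an iterable of strings, or nothing at all."

    def check(self):
        parallelism = conf.getint("core", "parallelism")
        if parallelism <= 0:
            return "core.parallelism is set to {}, expected a positive integer.".format(parallelism)
```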
+ +from __future__ import absolute_import + +from airflow.configuration import conf +from airflow.upgrade.rules.base_rule import BaseRule + + +class HostnameCallable(BaseRule): + title = "Unify hostname_callable option in core section" + + description = "hostname_callable option is using now only dots instead of dots and colons" + + def check(self): + default = "socket:getfqdn" + hostname_callable_conf = conf.get("core", "hostname_callable") + if hostname_callable_conf == default: + # If users use default value there's nothing they should do + return None + + if ":" in hostname_callable_conf: + return ( + "Error: hostname_callable `{}` " + "contains a colon instead of a dot. please change to `{}`".format( + hostname_callable_conf, hostname_callable_conf.replace(":", ".") + ) + ) + return None diff --git a/airflow/upgrade/rules/import_changes.py b/airflow/upgrade/rules/import_changes.py new file mode 100644 index 0000000000000..e9d0166ee9284 --- /dev/null +++ b/airflow/upgrade/rules/import_changes.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import itertools +from typing import NamedTuple, Optional, List +import os +from cached_property import cached_property +from packaging.version import Version + +from airflow import conf +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.upgrade.rules.renamed_classes import ALL +from airflow.utils.dag_processing import list_py_file_paths + +try: + from importlib_metadata import PackageNotFoundError, distribution +except ImportError: + from importlib.metadata import PackageNotFoundError, distribution + + +class ImportChange( + NamedTuple( + "ImportChange", + [("old_path", str), ("new_path", str), ("providers_package", Optional[str])], + ) +): + def info(self, file_path=None): + msg = "Using `{}` should be replaced by `{}`".format( + self.old_path, self.new_path + ) + if file_path: + msg += ". Affected file: {}".format(file_path) + return msg + + @cached_property + def old_path_without_classname(self): + part = self.old_path.split(".") + part.pop() + return ".".join(part) + + @cached_property + def old_class(self): + return self.old_path.split(".")[-1] + + @cached_property + def new_class(self): + return self.new_path.split(".")[-1] + + @classmethod + def provider_stub_from_module(cls, module): + if "providers" not in module: + return None + + # [2:] strips off the airflow.providers. 
part + parts = module.split(".")[2:] + if parts[0] in ('apache', 'cncf', 'microsoft'): + return '-'.join(parts[:2]) + return parts[0] + + @classmethod + def from_new_old_paths(cls, new_path, old_path): + providers_package = cls.provider_stub_from_module(new_path) + return cls( + old_path=old_path, new_path=new_path, providers_package=providers_package + ) + + +class ImportChangesRule(BaseRule): + title = "Changes in import paths of hooks, operators, sensors and others" + description = ( + "Many hooks, operators and other classes has been renamed and moved. Those changes were part of " + "unifying names and imports paths as described in AIP-21.\nThe `contrib` folder has been replaced " + "by `providers` directory and packages:\n" + "https://github.com/apache/airflow#backport-packages" + ) + + current_airflow_version = Version(__import__("airflow").__version__) + + if current_airflow_version < Version("2.0.0"): + + def _filter_incompatible_renames(arg): + new_path = arg[0] + return ( + not new_path.startswith("airflow.operators") + and not new_path.startswith("airflow.sensors") + and not new_path.startswith("airflow.hooks") + ) + + else: + # Everything allowed on 2.0.0+ + def _filter_incompatible_renames(arg): + return True + + ALL_CHANGES = [ + ImportChange.from_new_old_paths(*args) + for args in filter(_filter_incompatible_renames, ALL) + ] # type: List[ImportChange] + + del _filter_incompatible_renames + + @staticmethod + def _check_file(file_path): + problems = [] + providers = set() + with open(file_path, "r") as file: + try: + content = file.read() + for change in ImportChangesRule.ALL_CHANGES: + if change.old_path_without_classname in content and change.old_class in content: + problems.append(change.info(file_path)) + if change.providers_package: + providers.add(change.providers_package) + except UnicodeDecodeError: + problems.append("Unable to read python file {}".format(file_path)) + return problems, providers + + @staticmethod + def _check_missing_providers(providers): + + current_airflow_version = Version(__import__("airflow").__version__) + if current_airflow_version >= Version("2.0.0"): + prefix = "apache-airflow-providers-" + else: + prefix = "apache-airflow-backport-providers-" + + for provider in providers: + dist_name = prefix + provider + try: + distribution(dist_name) + except PackageNotFoundError: + yield "Please install `{}`".format(dist_name) + + def check(self): + dag_folder = conf.get("core", "dags_folder") + files = list_py_file_paths(directory=dag_folder, include_examples=False) + files = [file for file in files if os.path.splitext(file)[1] == ".py"] + problems = [] + providers = set() + # Split in to two groups - install backports first, then make changes + for file in files: + new_problems, new_providers = self._check_file(file) + problems.extend(new_problems) + providers |= new_providers + + return itertools.chain( + self._check_missing_providers(sorted(providers)), + problems, + ) diff --git a/airflow/upgrade/rules/legacy_ui_deprecated.py b/airflow/upgrade/rules/legacy_ui_deprecated.py new file mode 100644 index 0000000000000..9570af7819340 --- /dev/null +++ b/airflow/upgrade/rules/legacy_ui_deprecated.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from airflow.configuration import conf +from airflow.upgrade.rules.base_rule import BaseRule + + +class LegacyUIDeprecated(BaseRule): + title = "Legacy UI is deprecated by default" + + description = "Legacy UI is deprecated. FAB RBAC is enabled by default in order to increase security." + + def check(self): + if conf.has_option("webserver", "rbac"): + rbac = conf.get("webserver", "rbac") + if rbac == "false": + return ( + "rbac in airflow.cfg must be explicitly set empty as" + " RBAC mechanism is enabled by default." + ) diff --git a/airflow/upgrade/rules/logging_configuration.py b/airflow/upgrade/rules/logging_configuration.py new file mode 100644 index 0000000000000..e5ed73a827f73 --- /dev/null +++ b/airflow/upgrade/rules/logging_configuration.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from airflow.configuration import conf, AIRFLOW_HOME +from airflow.upgrade.rules.base_rule import BaseRule + + +class LoggingConfigurationRule(BaseRule): + title = "Logging configuration has been moved to new section" + + description = "The logging configurations have been moved from [core] to the new [logging] section." 
+ + def check(self): + logging_configs = [ + ("base_log_folder", "{}/logs".format(AIRFLOW_HOME)), + ("remote_logging", "False"), + ("remote_log_conn_id", ""), + ("remote_base_log_folder", ""), + ("encrypt_s3_logs", "False"), + ("logging_level", "INFO"), + ("fab_logging_level", "WARN"), + ("logging_config_class", ""), + ("colored_console_log", "True"), + ( + "colored_log_format", + "[%(blue)s%(asctime)s%(reset)s] {%(blue)s%(filename)s:%(reset)s%(lineno)d} " + "%(log_color)s%(levelname)s%(reset)s - %(log_color)s%(message)s%(reset)s", + ), + ( + "colored_formatter_class", + "airflow.utils.log.colored_log.CustomTTYColoredFormatter", + ), + ( + "log_format", + "[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s", + ), + ("simple_log_format", "%(asctime)s %(levelname)s - %(message)s"), + ("task_log_prefix_template", ""), + ( + "log_filename_template", + "{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log", + ), + ("log_processor_filename_template", "{{ filename }}.log"), + ( + "dag_processor_manager_log_location", + "{}/logs/dag_processor_manager/dag_processor_manager.log".format( + AIRFLOW_HOME + ), + ), + ("task_log_reader", "task"), + ] + + mismatches = [] + for logging_config, default in logging_configs: + if not conf.has_option("logging", logging_config) and conf.has_option( + "core", logging_config + ): + existing_config = conf.get("core", logging_config) + if existing_config != default: + mismatches.append( + "{} has been moved from [core] to a the new [logging] section.".format( + logging_config + ) + ) + + return mismatches diff --git a/airflow/upgrade/rules/mesos_executor_removed.py b/airflow/upgrade/rules/mesos_executor_removed.py new file mode 100644 index 0000000000000..c0e6b52efdeb6 --- /dev/null +++ b/airflow/upgrade/rules/mesos_executor_removed.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.configuration import conf + + +class MesosExecutorRemovedRule(BaseRule): + """ + MesosExecutorRemovedRule class to ease upgrade to Airflow 2.0 + """ + title = "Removal of Mesos Executor" + description = "The Mesos Executor has been deprecated as it was not widely used and not maintained." + + def check(self): + executor_key = conf.get(section="core", key="executor") + if executor_key == "MesosExecutor": + return ( + "The Mesos Executor has been deprecated as it was not widely used and not maintained." + "Please migrate to any of the supported executors." + "See https://airflow.apache.org/docs/stable/executor/index.html for more details." 
+ ) diff --git a/airflow/upgrade/rules/no_additional_args_in_operators.py b/airflow/upgrade/rules/no_additional_args_in_operators.py new file mode 100644 index 0000000000000..0a96f28f2dfd7 --- /dev/null +++ b/airflow/upgrade/rules/no_additional_args_in_operators.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +import os +import re +import logging +import warnings +from airflow.utils.dag_processing import correct_maybe_zipped, list_py_file_paths +from airflow import conf +from airflow.models.dagbag import DagBag +from airflow.upgrade.rules.base_rule import BaseRule + + +class NoAdditionalArgsInOperatorsRule(BaseRule): + title = "No additional argument allowed in BaseOperator." + + description = """\ +Passing unrecognized arguments to operators is not allowed in Airflow 2.0 anymore, +and will cause an exception. + """ + + def check(self, dags_folder=None): + if not dags_folder: + dags_folder = conf.get("core", "dags_folder") + + logger = logging.root + old_level = logger.level + try: + logger.setLevel(logging.ERROR) + dagbag = DagBag(dag_folder=os.devnull, include_examples=False, store_serialized_dags=False) + dags_folder = correct_maybe_zipped(dags_folder) + + # Each file in the DAG folder is parsed individually + for filepath in list_py_file_paths(dags_folder, + safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE'), + include_examples=False): + try: + with warnings.catch_warnings(record=True) as captured_warnings: + _ = dagbag.process_file( + filepath, + safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')) + except Exception: + pass + + for warning in captured_warnings: + if warning.category in (DeprecationWarning, PendingDeprecationWarning) \ + and str(warning.message).startswith("Invalid arguments were passed"): + m = re.match(r''' + .* \(task_id:\ ([^\)]+)\) .* \n + \*args:\ (.*) \n + \*\*kwargs:\ (.*) + ''', str(warning.message), re.VERBOSE) + + yield "DAG file `{}` with task_id `{}` has unrecognized positional args `{}`" \ + "and keyword args " \ + "`{}`".format(filepath, m.group(1), m.group(2), m.group(3)) + finally: + logger.setLevel(old_level) diff --git a/airflow/upgrade/rules/pod_template_file_rule.py b/airflow/upgrade/rules/pod_template_file_rule.py new file mode 100644 index 0000000000000..148d34dad3207 --- /dev/null +++ b/airflow/upgrade/rules/pod_template_file_rule.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.configuration import conf + +invalid_config_keys = { + "airflow_configmap", + "airflow_local_settings_configmap", + "dags_in_image", + "dags_volume_subpath", + "dags_volume_mount_point", "dags_volume_claim", + "logs_volume_subpath", "logs_volume_claim", + "dags_volume_host", "logs_volume_host", + "env_from_configmap_ref", "env_from_secret_ref", "git_repo", + "git_branch", "git_sync_depth", "git_subpath", + "git_sync_rev", "git_user", "git_password", + "git_sync_root", "git_sync_dest", + "git_dags_folder_mount_point", "git_ssh_key_secret_name", + "git_ssh_known_hosts_configmap_name", "git_sync_credentials_secret", + "git_sync_container_repository", + "git_sync_container_tag", "git_sync_init_container_name", + "git_sync_run_as_user", + "worker_service_account_name", "image_pull_secrets", + "gcp_service_account_keys", "affinity", + "tolerations", "run_as_user", "fs_group" +} + + +class PodTemplateFileRule(BaseRule): + title = "Users must set a kubernetes.pod_template_file value" + + description = """\ +In Airflow 2.0, KubernetesExecutor Users need to set a pod_template_file as a base +value for all pods launched by the KubernetesExecutor. Many Kubernetes configs are no longer +needed once this pod_template_file has been generated. +""" + + def should_skip(self): + # Check this rule only if users use KubernetesExecutor + if conf.get("core", "executor") != "KubernetesExecutor": + return "Skipped because this rule applies only to environment using KubernetesExecutor." + + def check(self): + pod_template_file = conf.get("kubernetes", "pod_template_file", fallback=None) + if not pod_template_file: + return ( + "Please create a pod_template_file by running `airflow generate_pod_template`.\n" + "This will generate a pod using your aiflow.cfg settings" + ) + + conf_dict = conf.as_dict(display_sensitive=True) + kube_conf = conf_dict['kubernetes'] + keys = kube_conf.keys() + resp = [k for k in keys if k in invalid_config_keys] + if conf_dict['kubernetes_labels']: + resp.append("kubernetes_labels") + if conf_dict['kubernetes_secrets']: + resp.append("kubernetes_secrets") + + if resp: + resp_string = "\n".join(resp) + return "The following invalid keys were found in your airflow.cfg: \ + \n\n{resp_string}\n\n \ + Now that you have a pod_template_file, these keys no longer do anything.\n\ + Please delete these keys.".format(resp_string=resp_string) diff --git a/airflow/upgrade/rules/postgres_mysql_sqlite_version_upgrade_check.py b/airflow/upgrade/rules/postgres_mysql_sqlite_version_upgrade_check.py new file mode 100644 index 0000000000000..60206655fa2c1 --- /dev/null +++ b/airflow/upgrade/rules/postgres_mysql_sqlite_version_upgrade_check.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from packaging.version import Version + +from airflow.configuration import conf +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.utils.db import provide_session + + +class DatabaseVersionCheckRule(BaseRule): + title = "Check versions of PostgreSQL, MySQL, and SQLite to ease upgrade to Airflow 2.0" + + description = """\ +From Airflow 2.0, the following database versions are supported: +PostgreSQl - 9.6, 10, 11, 12, 13; +MySQL - 5.7, 8; +SQLite - 3.15+ + """ + + @provide_session + def check(self, session=None): + + more_info = "See link below for more details: https://github.com/apache/airflow#requirements" + + conn_str = conf.get(section="core", key="sql_alchemy_conn") + + if "sqlite" in conn_str: + min_req_sqlite_version = Version('3.15') + installed_sqlite_version = Version(session.execute('select sqlite_version();').scalar()) + if installed_sqlite_version < min_req_sqlite_version: + return "From Airflow 2.0, SQLite version below {} is no longer supported. \n{}".format( + min_req_sqlite_version, more_info + ) + + elif "postgres" in conn_str: + min_req_postgres_version = Version('9.6') + installed_postgres_version = Version(session.execute('SHOW server_version;').scalar()) + if installed_postgres_version < min_req_postgres_version: + return "From Airflow 2.0, PostgreSQL version below {} is no longer supported. \n{}".format( + min_req_postgres_version, more_info + ) + + elif "mysql" in conn_str: + min_req_mysql_version = Version('5.7') + # special treatment is needed here, because MySQL version may include a suffix like '-log' + installed_mysql_version = Version(session.execute('SELECT VERSION();').scalar().split('-')[0]) + if installed_mysql_version < min_req_mysql_version: + return "From Airflow 2.0, MySQL version below {} is no longer supported. \n{}".format( + min_req_mysql_version, more_info + ) diff --git a/airflow/upgrade/rules/renamed_classes.py b/airflow/upgrade/rules/renamed_classes.py new file mode 100644 index 0000000000000..02c0a75d00980 --- /dev/null +++ b/airflow/upgrade/rules/renamed_classes.py @@ -0,0 +1,1739 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
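Every pair in the lists that follow is ordered `(new_path, old_path)`; `ImportChangesRule` turns each into an `ImportChange` and reports any occurrence it finds in DAG files. A small sketch of that mapping, using one of the hook pairs below (the DAG file path is made up):

```python
from airflow.upgrade.rules.import_changes import ImportChange

change = ImportChange.from_new_old_paths(
    "airflow.providers.google.cloud.hooks.gcs.GCSHook",       # new_path
    "airflow.contrib.hooks.gcs_hook.GoogleCloudStorageHook",  # old_path
)

print(change.old_class)          # GoogleCloudStorageHook
print(change.providers_package)  # google -> apache-airflow-backport-providers-google on 1.10.x
print(change.info("dags/my_dag.py"))
# Using `airflow.contrib.hooks.gcs_hook.GoogleCloudStorageHook` should be replaced by
# `airflow.providers.google.cloud.hooks.gcs.GCSHook`. Affected file: dags/my_dag.py
```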
+ +HOOKS = [ + ( + "airflow.providers.apache.cassandra.hooks.cassandra.CassandraHook", + "airflow.contrib.hooks.cassandra_hook.CassandraHook", + ), + ( + "airflow.providers.google.cloud.hooks.compute.ComputeEngineHook", + "airflow.contrib.hooks.gcp_compute_hook.GceHook", + ), + ( + "airflow.providers.google.common.hooks.base_google.GoogleBaseHook", + "airflow.contrib.hooks.gcp_api_base_hook.GoogleBaseHook", + ), + ( + "airflow.providers.google.cloud.hooks.dataflow.DataflowHook", + "airflow.contrib.hooks.gcp_dataflow_hook.DataFlowHook", + ), + ( + "airflow.providers.google.cloud.hooks.dataproc.DataprocHook", + "airflow.contrib.hooks.gcp_dataproc_hook.DataProcHook", + ), + ( + "airflow.providers.google.cloud.hooks.dlp.CloudDLPHook", + "airflow.contrib.hooks.gcp_dlp_hook.CloudDLPHook", + ), + ( + "airflow.providers.google.cloud.hooks.functions.CloudFunctionsHook", + "airflow.contrib.hooks.gcp_function_hook.GcfHook", + ), + ( + "airflow.providers.google.cloud.hooks.kms.CloudKMSHook", + "airflow.contrib.hooks.gcp_kms_hook.GoogleCloudKMSHook", + ), + ( + "airflow.providers.google.cloud.hooks.mlengine.MLEngineHook", + "airflow.contrib.hooks.gcp_mlengine_hook.MLEngineHook", + ), + ( + "airflow.providers.google.cloud.hooks.spanner.SpannerHook", + "airflow.contrib.hooks.gcp_spanner_hook.CloudSpannerHook", + ), + ( + "airflow.providers.google.cloud.hooks.speech_to_text.CloudSpeechToTextHook", + "airflow.contrib.hooks.gcp_speech_to_text_hook.GCPSpeechToTextHook", + ), + ( + "airflow.providers.google.cloud.hooks.text_to_speech.CloudTextToSpeechHook", + "airflow.contrib.hooks.gcp_text_to_speech_hook.GCPTextToSpeechHook", + ), + ( + "airflow.providers.google.cloud.hooks.gcs.GCSHook", + "airflow.contrib.hooks.gcs_hook.GoogleCloudStorageHook", + ), + ( + "airflow.providers.google.cloud.hooks.cloud_build.CloudBuildHook", + "airflow.contrib.hooks.gcp_cloud_build_hook.CloudBuildHook", + ), + ( + "airflow.providers.google.cloud.hooks.bigtable.BigtableHook", + "airflow.contrib.hooks.gcp_bigtable_hook.BigtableHook", + ), + ( + "airflow.providers.google.cloud.hooks.kubernetes_engine.GKEHook", + "airflow.contrib.hooks.gcp_container_hook.GKEClusterHook", + ), + ( + "airflow.providers.google.cloud.hooks.datastore.DatastoreHook", + "airflow.contrib.hooks.datastore_hook.DatastoreHook", + ), + ( + "airflow.providers.google.cloud.hooks.natural_language.CloudNaturalLanguageHook", + "airflow.contrib.hooks.gcp_natural_language_hook.CloudNaturalLanguageHook", + ), + ( + "airflow.providers.google.cloud.hooks.pubsub.PubSubHook", + "airflow.contrib.hooks.gcp_pubsub_hook.PubSubHook", + ), + ( + "airflow.providers.google.cloud.hooks.cloud_sql.CloudSQLHook", + "airflow.contrib.hooks.gcp_sql_hook.CloudSqlHook", + ), + ( + "airflow.providers.google.cloud.hooks.cloud_sql.CloudSQLDatabaseHook", + "airflow.contrib.hooks.gcp_sql_hook.CloudSqlDatabaseHook", + ), + ( + "airflow.providers.google.cloud.hooks.tasks.CloudTasksHook", + "airflow.contrib.hooks.gcp_tasks_hook.CloudTasksHook", + ), + ( + "airflow.providers.google.cloud.hooks.cloud_storage_transfer_service.CloudDataTransferServiceHook", + "airflow.contrib.hooks.gcp_transfer_hook.GCPTransferServiceHook", + ), + ( + "airflow.providers.google.cloud.hooks.translate.CloudTranslateHook", + "airflow.contrib.hooks.gcp_translate_hook.CloudTranslateHook", + ), + ( + "airflow.providers.google.cloud.hooks.video_intelligence.CloudVideoIntelligenceHook", + "airflow.contrib.hooks.gcp_video_intelligence_hook.CloudVideoIntelligenceHook", + ), + ( + 
"airflow.providers.google.cloud.hooks.vision.CloudVisionHook", + "airflow.contrib.hooks.gcp_vision_hook.CloudVisionHook", + ), + ( + "airflow.providers.google.cloud.hooks.bigquery.BigQueryHook", + "airflow.contrib.hooks.bigquery_hook.BigQueryHook", + ), + ( + "airflow.providers.amazon.aws.hooks.athena.AWSAthenaHook", + "airflow.contrib.hooks.aws_athena_hook.AWSAthenaHook", + ), + ( + "airflow.providers.amazon.aws.hooks.datasync.AWSDataSyncHook", + "airflow.contrib.hooks.aws_datasync_hook.AWSDataSyncHook", + ), + ("airflow.providers.amazon.aws.hooks.s3.S3Hook", "airflow.hooks.S3_hook.S3Hook"), + ( + "airflow.providers.amazon.aws.hooks.sqs.SQSHook", + "airflow.contrib.hooks.aws_sqs_hook.SQSHook", + ), + ( + "airflow.providers.amazon.aws.hooks.lambda_function.AwsLambdaHook", + "airflow.contrib.hooks.aws_lambda_hook.AwsLambdaHook", + ), + ( + "airflow.providers.amazon.aws.hooks.sns.AwsSnsHook", + "airflow.contrib.hooks.aws_sns_hook.AwsSnsHook", + ), + ( + "airflow.providers.apache.pinot.hooks.pinot.PinotDbApiHook", + "airflow.contrib.hooks.pinot_hook.PinotDbApiHook", + ), + ( + "airflow.providers.apache.pinot.hooks.pinot.PinotAdminHook", + "airflow.contrib.hooks.pinot_hook.PinotAdminHook", + ), + ( + "airflow.providers.apache.spark.hooks.spark_jdbc.SparkJDBCHook", + "airflow.contrib.hooks.spark_jdbc_hook.SparkJDBCHook", + ), + ( + "airflow.providers.apache.spark.hooks.spark_sql.SparkSqlHook", + "airflow.contrib.hooks.spark_sql_hook.SparkSqlHook", + ), + ( + "airflow.providers.apache.spark.hooks.spark_submit.SparkSubmitHook", + "airflow.contrib.hooks.spark_submit_hook.SparkSubmitHook", + ), + ( + "airflow.providers.apache.sqoop.hooks.sqoop.SqoopHook", + "airflow.contrib.hooks.sqoop_hook.SqoopHook", + ), + ( + "airflow.providers.apache.druid.hooks.druid.DruidHook", + "airflow.hooks.druid_hook.DruidHook", + ), + ( + "airflow.providers.apache.druid.hooks.druid.DruidDbApiHook", + "airflow.hooks.druid_hook.DruidDbApiHook", + ), + ( + "airflow.providers.apache.hdfs.hooks.hdfs.HDFSHookException", + "airflow.hooks.hdfs_hook.HDFSHookException", + ), + ( + "airflow.providers.apache.hdfs.hooks.hdfs.HDFSHook", + "airflow.hooks.hdfs_hook.HDFSHook", + ), + ( + "airflow.providers.apache.hive.hooks.hive.HiveMetastoreHook", + "airflow.hooks.hive_hooks.HiveMetastoreHook", + ), + ( + "airflow.providers.apache.hive.hooks.hive.HiveCliHook", + "airflow.hooks.hive_hooks.HiveCliHook", + ), + ( + "airflow.providers.apache.hive.hooks.hive.HiveServer2Hook", + "airflow.hooks.hive_hooks.HiveServer2Hook", + ), + ( + "airflow.providers.apache.pig.hooks.pig.PigCliHook", + "airflow.hooks.pig_hook.PigCliHook", + ), + ( + "airflow.providers.apache.hdfs.hooks.webhdfs.WebHDFSHook", + "airflow.hooks.webhdfs_hook.WebHDFSHook", + ), + ("airflow.hooks.filesystem.FSHook", "airflow.contrib.hooks.fs_hook.FSHook"), + ( + "airflow.providers.microsoft.azure.hooks.azure_container_instance.AzureContainerInstanceHook", + "airflow.contrib.hooks.azure_container_instance_hook.AzureContainerInstanceHook", + ), + ( + "airflow.providers.microsoft.azure.hooks.azure_container_registry.AzureContainerRegistryHook", + "airflow.contrib.hooks.azure_container_registry_hook.AzureContainerRegistryHook", + ), + ( + "airflow.providers.microsoft.azure.hooks.azure_container_volume.AzureContainerVolumeHook", + "airflow.contrib.hooks.azure_container_volume_hook.AzureContainerVolumeHook", + ), + ( + "airflow.providers.microsoft.azure.hooks.azure_cosmos.AzureCosmosDBHook", + "airflow.contrib.hooks.azure_cosmos_hook.AzureCosmosDBHook", + ), + ( + 
"airflow.providers.microsoft.azure.hooks.azure_fileshare.AzureFileShareHook", + "airflow.contrib.hooks.azure_fileshare_hook.AzureFileShareHook", + ), + ( + "airflow.providers.microsoft.azure.hooks.wasb.WasbHook", + "airflow.contrib.hooks.wasb_hook.WasbHook", + ), + ( + "airflow.providers.amazon.aws.hooks.glue_catalog.AwsGlueCatalogHook", + "airflow.contrib.hooks.aws_glue_catalog_hook.AwsGlueCatalogHook", + ), + ( + "airflow.providers.amazon.aws.hooks.logs.AwsLogsHook", + "airflow.contrib.hooks.aws_logs_hook.AwsLogsHook", + ), + ( + "airflow.providers.amazon.aws.hooks.emr.EmrHook", + "airflow.contrib.hooks.emr_hook.EmrHook", + ), + ( + "airflow.providers.amazon.aws.hooks.sagemaker.SageMakerHook", + "airflow.contrib.hooks.sagemaker_hook.SageMakerHook", + ), + ( + "airflow.providers.mongo.hooks.mongo.MongoHook", + "airflow.contrib.hooks.mongo_hook.MongoHook", + ), + ( + "airflow.providers.openfaas.hooks.openfaas.OpenFaasHook", + "airflow.contrib.hooks.openfaas_hook.OpenFaasHook", + ), + ( + "airflow.providers.redis.hooks.redis.RedisHook", + "airflow.contrib.hooks.redis_hook.RedisHook", + ), + ( + "airflow.providers.docker.hooks.docker.DockerHook", + "airflow.hooks.docker_hook.DockerHook", + ), + ( + "airflow.providers.microsoft.mssql.hooks.mssql.MsSqlHook", + "airflow.hooks.mssql_hook.MsSqlHook", + ), + ( + "airflow.providers.mysql.hooks.mysql.MySqlHook", + "airflow.hooks.mysql_hook.MySqlHook", + ), + ( + "airflow.providers.oracle.hooks.oracle.OracleHook", + "airflow.hooks.oracle_hook.OracleHook", + ), + ( + "airflow.providers.postgres.hooks.postgres.PostgresHook", + "airflow.hooks.postgres_hook.PostgresHook", + ), + ( + "airflow.providers.presto.hooks.presto.PrestoHook", + "airflow.hooks.presto_hook.PrestoHook", + ), + ( + "airflow.providers.samba.hooks.samba.SambaHook", + "airflow.hooks.samba_hook.SambaHook", + ), + ( + "airflow.providers.sqlite.hooks.sqlite.SqliteHook", + "airflow.hooks.sqlite_hook.SqliteHook", + ), + ( + "airflow.providers.cloudant.hooks.cloudant.CloudantHook", + "airflow.contrib.hooks.cloudant_hook.CloudantHook", + ), + ( + "airflow.providers.databricks.hooks.databricks.DatabricksHook", + "airflow.contrib.hooks.databricks_hook.DatabricksHook", + ), + ( + "airflow.providers.databricks.hooks.databricks.DatabricksHook", + "airflow.contrib.hooks.databricks_hook.DatabricksHook", + ), + ( + "airflow.providers.datadog.hooks.datadog.DatadogHook", + "airflow.contrib.hooks.datadog_hook.DatadogHook", + ), + ( + "airflow.providers.dingding.hooks.dingding.DingdingHook", + "airflow.contrib.hooks.dingding_hook.DingdingHook", + ), + ( + "airflow.providers.discord.hooks.discord_webhook.DiscordWebhookHook", + "airflow.contrib.hooks.discord_webhook_hook.DiscordWebhookHook", + ), + ( + "airflow.providers.google.suite.hooks.drive.GoogleDriveHook", + "airflow.contrib.hooks.gdrive_hook.GoogleDriveHook", + ), + ( + "airflow.providers.jenkins.hooks.jenkins.JenkinsHook", + "airflow.contrib.hooks.jenkins_hook.JenkinsHook", + ), + ( + "airflow.providers.jira.hooks.jira.JiraHook", + "airflow.contrib.hooks.jira_hook.JiraHook", + ), + ( + "airflow.providers.opsgenie.hooks.opsgenie_alert.OpsgenieAlertHook", + "airflow.contrib.hooks.opsgenie_alert_hook.OpsgenieAlertHook", + ), + ( + "airflow.providers.pagerduty.hooks.pagerduty.PagerdutyHook", + "airflow.contrib.hooks.pagerduty_hook.PagerdutyHook", + ), + ( + "airflow.providers.qubole.hooks.qubole_check.QuboleCheckHook", + "airflow.contrib.hooks.qubole_check_hook.QuboleCheckHook", + ), + ( + "airflow.providers.qubole.hooks.qubole.QuboleHook", + 
"airflow.contrib.hooks.qubole_hook.QuboleHook", + ), + ( + "airflow.providers.salesforce.hooks.salesforce.SalesforceHook", + "airflow.contrib.hooks.salesforce_hook.SalesforceHook", + ), + ( + "airflow.providers.segment.hooks.segment.SegmentHook", + "airflow.contrib.hooks.segment_hook.SegmentHook", + ), + ( + "airflow.providers.slack.hooks.slack_webhook.SlackWebhookHook", + "airflow.contrib.hooks.slack_webhook_hook.SlackWebhookHook", + ), + ( + "airflow.providers.vertica.hooks.vertica.VerticaHook", + "airflow.contrib.hooks.vertica_hook.VerticaHook", + ), + ( + "airflow.providers.slack.hooks.slack.SlackHook", + "airflow.hooks.slack_hook.SlackHook", + ), + ( + "airflow.providers.zendesk.hooks.zendesk.ZendeskHook", + "airflow.hooks.zendesk_hook.ZendeskHook", + ), + ( + "airflow.providers.ftp.hooks.ftp.FTPSHook", + "airflow.contrib.hooks.ftp_hook.FTPSHook", + ), + ( + "airflow.providers.ftp.hooks.ftp.FTPHook", + "airflow.contrib.hooks.ftp_hook.FTPHook", + ), + ( + "airflow.providers.imap.hooks.imap.ImapHook", + "airflow.contrib.hooks.imap_hook.ImapHook", + ), + ( + "airflow.providers.ssh.hooks.ssh.SSHHook", + "airflow.contrib.hooks.ssh_hook.SSHHook", + ), + ( + "airflow.providers.microsoft.winrm.hooks.winrm.WinRMHook", + "airflow.contrib.hooks.winrm_hook.WinRMHook", + ), + ("airflow.providers.http.hooks.http.HttpHook", "airflow.hooks.http_hook.HttpHook"), + ("airflow.providers.jdbc.hooks.jdbc.JdbcHook", "airflow.hooks.jdbc_hook.JdbcHook"), + ( + "airflow.providers.amazon.aws.hooks.base_aws.AwsBaseHook", + "airflow.contrib.hooks.aws_hook.AwsHook", + ), + ( + "airflow.providers.amazon.aws.hooks.aws_dynamodb.AwsDynamoDBHook", + "airflow.contrib.hooks.aws_dynamodb_hook.AwsDynamoDBHook", + ), + ( + "airflow.providers.sftp.hooks.sftp.SFTPHook", + "airflow.contrib.hooks.sftp_hook.SFTPHook", + ), +] + +OPERATORS = [ + ( + "airflow.providers.google.cloud.operators.dataflow.DataflowCreateJavaJobOperator", + "airflow.contrib.operators.dataflow_operator.DataFlowJavaOperator", + ), + ( + "airflow.providers.google.cloud.operators.dataflow.DataflowCreatePythonJobOperator", + "airflow.contrib.operators.dataflow_operator.DataFlowPythonOperator", + ), + ( + "airflow.providers.google.cloud.operators.dataflow.DataflowTemplatedJobStartOperator", + "airflow.contrib.operators.dataflow_operator.DataflowTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.datastore.CloudDatastoreExportEntitiesOperator", + "airflow.contrib.operators.datastore_export_operator.DatastoreExportOperator", + ), + ( + "airflow.providers.google.cloud.operators.datastore.CloudDatastoreImportEntitiesOperator", + "airflow.contrib.operators.datastore_import_operator.DatastoreImportOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigtable.BigtableUpdateClusterOperator", + "airflow.contrib.operators.gcp_bigtable_operator.BigtableClusterUpdateOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigtable.BigtableCreateInstanceOperator", + "airflow.contrib.operators.gcp_bigtable_operator.BigtableInstanceCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigtable.BigtableDeleteInstanceOperator", + "airflow.contrib.operators.gcp_bigtable_operator.BigtableInstanceDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigtable.BigtableCreateTableOperator", + "airflow.contrib.operators.gcp_bigtable_operator.BigtableTableCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigtable.BigtableDeleteTableOperator", + 
"airflow.contrib.operators.gcp_bigtable_operator.BigtableTableDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_build.CloudBuildCreateBuildOperator", + "airflow.contrib.operators.gcp_cloud_build_operator.CloudBuildCreateBuildOperator", + ), + ( + "airflow.providers.google.cloud.operators.compute.ComputeEngineBaseOperator", + "airflow.contrib.operators.gcp_compute_operator.GceBaseOperator", + ), + ( + "airflow.providers.google.cloud.operators.compute" + ".ComputeEngineInstanceGroupUpdateManagerTemplateOperator", + "airflow.contrib.operators.gcp_compute_operator." + "GceInstanceGroupManagerUpdateTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.compute.ComputeEngineStartInstanceOperator", + "airflow.contrib.operators.gcp_compute_operator.GceInstanceStartOperator", + ), + ( + "airflow.providers.google.cloud.operators.compute.ComputeEngineStopInstanceOperator", + "airflow.contrib.operators.gcp_compute_operator.GceInstanceStopOperator", + ), + ( + "airflow.providers.google.cloud.operators.compute.ComputeEngineCopyInstanceTemplateOperator", + "airflow.contrib.operators.gcp_compute_operator.GceInstanceTemplateCopyOperator", + ), + ( + "airflow.providers.google.cloud.operators.compute.ComputeEngineSetMachineTypeOperator", + "airflow.contrib.operators.gcp_compute_operator.GceSetMachineTypeOperator", + ), + ( + "airflow.providers.google.cloud.operators.kubernetes_engine.GKECreateClusterOperator", + "airflow.contrib.operators.gcp_container_operator.GKEClusterCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.kubernetes_engine.GKEDeleteClusterOperator", + "airflow.contrib.operators.gcp_container_operator.GKEClusterDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.kubernetes_engine.GKEStartPodOperator", + "airflow.contrib.operators.gcp_container_operator.GKEPodOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPCancelDLPJobOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPCancelDLPJobOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPCreateDeidentifyTemplateOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPCreateDeidentifyTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPCreateDLPJobOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPCreateDLPJobOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPCreateInspectTemplateOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPCreateInspectTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPCreateJobTriggerOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPCreateJobTriggerOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPCreateStoredInfoTypeOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPCreateStoredInfoTypeOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPDeidentifyContentOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPDeidentifyContentOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPDeleteDeidentifyTemplateOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPDeleteDeidentifyTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPDeleteDLPJobOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPDeleteDlpJobOperator", + ), + ( + 
"airflow.providers.google.cloud.operators.dlp.CloudDLPDeleteInspectTemplateOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPDeleteInspectTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPDeleteJobTriggerOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPDeleteJobTriggerOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPDeleteStoredInfoTypeOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPDeleteStoredInfoTypeOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPGetDeidentifyTemplateOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPGetDeidentifyTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPGetDLPJobOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPGetDlpJobOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPGetInspectTemplateOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPGetInspectTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPGetDLPJobTriggerOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPGetJobTripperOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPGetStoredInfoTypeOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPGetStoredInfoTypeOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPInspectContentOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPInspectContentOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPListDeidentifyTemplatesOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPListDeidentifyTemplatesOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPListDLPJobsOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPListDlpJobsOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPListInfoTypesOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPListInfoTypesOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPListInspectTemplatesOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPListInspectTemplatesOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPListJobTriggersOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPListJobTriggersOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPListStoredInfoTypesOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPListStoredInfoTypesOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPRedactImageOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPRedactImageOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPReidentifyContentOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPReidentifyContentOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPUpdateDeidentifyTemplateOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPUpdateDeidentifyTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPUpdateInspectTemplateOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPUpdateInspectTemplateOperator", + ), + ( + "airflow.providers.google.cloud.operators.dlp.CloudDLPUpdateJobTriggerOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPUpdateJobTriggerOperator", + ), + ( + 
"airflow.providers.google.cloud.operators.dlp.CloudDLPUpdateStoredInfoTypeOperator", + "airflow.contrib.operators.gcp_dlp_operator.CloudDLPUpdateStoredInfoTypeOperator", + ), + ( + "airflow.providers.google.cloud.operators.functions.CloudFunctionDeleteFunctionOperator", + "airflow.contrib.operators.gcp_function_operator.GcfFunctionDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.functions.CloudFunctionDeployFunctionOperator", + "airflow.contrib.operators.gcp_function_operator.GcfFunctionDeployOperator", + ), + ( + "airflow.providers.google.cloud.operators.natural_language." + "CloudNaturalLanguageAnalyzeEntitiesOperator", + "airflow.contrib.operators.gcp_natural_language_operator." + "CloudLanguageAnalyzeEntitiesOperator", + ), + ( + "airflow.providers.google.cloud.operators.natural_language." + "CloudNaturalLanguageAnalyzeEntitySentimentOperator", + "airflow.contrib.operators.gcp_natural_language_operator." + "CloudLanguageAnalyzeEntitySentimentOperator", + ), + ( + "airflow.providers.google.cloud.operators.natural_language." + "CloudNaturalLanguageAnalyzeSentimentOperator", + "airflow.contrib.operators.gcp_natural_language_operator." + "CloudLanguageAnalyzeSentimentOperator", + ), + ( + "airflow.providers.google.cloud.operators.natural_language." + "CloudNaturalLanguageClassifyTextOperator", + "airflow.contrib.operators.gcp_natural_language_operator.CloudLanguageClassifyTextOperator", + ), + ( + "airflow.providers.google.cloud.operators.spanner.SpannerDeleteDatabaseInstanceOperator", + "airflow.contrib.operators.gcp_spanner_operator.CloudSpannerInstanceDatabaseDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.spanner.SpannerDeployDatabaseInstanceOperator", + "airflow.contrib.operators.gcp_spanner_operator.CloudSpannerInstanceDatabaseDeployOperator", + ), + ( + "airflow.providers.google.cloud.operators.spanner.SpannerQueryDatabaseInstanceOperator", + "airflow.contrib.operators.gcp_spanner_operator.CloudSpannerInstanceDatabaseQueryOperator", + ), + ( + "airflow.providers.google.cloud.operators.spanner.SpannerUpdateDatabaseInstanceOperator", + "airflow.contrib.operators.gcp_spanner_operator.CloudSpannerInstanceDatabaseUpdateOperator", + ), + ( + "airflow.providers.google.cloud.operators.spanner.SpannerDeleteInstanceOperator", + "airflow.contrib.operators.gcp_spanner_operator.CloudSpannerInstanceDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.spanner.SpannerDeployInstanceOperator", + "airflow.contrib.operators.gcp_spanner_operator.CloudSpannerInstanceDeployOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service" + ".CloudDataTransferServiceCreateJobOperator", + "airflow.contrib.operators.gcp_transfer_operator.GcpTransferServiceJobCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service" + ".CloudDataTransferServiceDeleteJobOperator", + "airflow.contrib.operators.gcp_transfer_operator.GcpTransferServiceJobDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service" + ".CloudDataTransferServiceUpdateJobOperator", + "airflow.contrib.operators.gcp_transfer_operator.GcpTransferServiceJobUpdateOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service." + "CloudDataTransferServiceCancelOperationOperator", + "airflow.contrib.operators.gcp_transfer_operator." 
+ "GcpTransferServiceOperationCancelOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service." + "CloudDataTransferServiceGetOperationOperator", + "airflow.contrib.operators.gcp_transfer_operator." + "GcpTransferServiceOperationGetOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service." + "CloudDataTransferServicePauseOperationOperator", + "airflow.contrib.operators.gcp_transfer_operator." + "GcpTransferServiceOperationPauseOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service." + "CloudDataTransferServiceResumeOperationOperator", + "airflow.contrib.operators.gcp_transfer_operator." + "GcpTransferServiceOperationResumeOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service." + "CloudDataTransferServiceListOperationsOperator", + "airflow.contrib.operators.gcp_transfer_operator." + "GcpTransferServiceOperationsListOperator", + ), + ( + "airflow.providers.google.cloud.operators.translate.CloudTranslateTextOperator", + "airflow.contrib.operators.gcp_translate_operator.CloudTranslateTextOperator", + ), + ( + "airflow.providers.google.cloud.operators.translate_speech.CloudTranslateSpeechOperator", + "airflow.contrib.operators.gcp_translate_speech_operator.CloudTranslateSpeechOperator", + ), + ( + "airflow.providers.google.cloud.operators.video_intelligence." + "CloudVideoIntelligenceDetectVideoExplicitContentOperator", + "airflow.contrib.operators.gcp_video_intelligence_operator." + "CloudVideoIntelligenceDetectVideoExplicitContentOperator", + ), + ( + "airflow.providers.google.cloud.operators.video_intelligence." + "CloudVideoIntelligenceDetectVideoLabelsOperator", + "airflow.contrib.operators.gcp_video_intelligence_operator." + "CloudVideoIntelligenceDetectVideoLabelsOperator", + ), + ( + "airflow.providers.google.cloud.operators.video_intelligence." + "CloudVideoIntelligenceDetectVideoShotsOperator", + "airflow.contrib.operators.gcp_video_intelligence_operator." 
+ "CloudVideoIntelligenceDetectVideoShotsOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionImageAnnotateOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionAnnotateImageOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionTextDetectOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionDetectDocumentTextOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionDetectImageLabelsOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionDetectImageLabelsOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionDetectImageSafeSearchOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionDetectImageSafeSearchOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionDetectTextOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionDetectTextOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionCreateProductOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionProductCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionDeleteProductOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionProductDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionGetProductOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionProductGetOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionCreateProductSetOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionProductSetCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionDeleteProductSetOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionProductSetDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionGetProductSetOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionProductSetGetOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionUpdateProductSetOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionProductSetUpdateOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionUpdateProductOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionProductUpdateOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionCreateReferenceImageOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionReferenceImageCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionRemoveProductFromProductSetOperator", + "airflow.contrib.operators.gcp_vision_operator." 
+ "CloudVisionRemoveProductFromProductSetOperator", + ), + ( + "airflow.providers.google.cloud.operators.mlengine.MLEngineStartBatchPredictionJobOperator", + "airflow.contrib.operators.mlengine_operator.MLEngineBatchPredictionOperator", + ), + ( + "airflow.providers.google.cloud.operators.mlengine.MLEngineManageModelOperator", + "airflow.contrib.operators.mlengine_operator.MLEngineModelOperator", + ), + ( + "airflow.providers.google.cloud.operators.mlengine.MLEngineStartTrainingJobOperator", + "airflow.contrib.operators.mlengine_operator.MLEngineTrainingOperator", + ), + ( + "airflow.providers.google.cloud.operators.mlengine.MLEngineManageVersionOperator", + "airflow.contrib.operators.mlengine_operator.MLEngineVersionOperator", + ), + ( + "airflow.providers.google.cloud.operators.pubsub.PubSubPublishMessageOperator", + "airflow.contrib.operators.pubsub_operator.PubSubPublishOperator", + ), + ( + "airflow.providers.google.cloud.operators.pubsub.PubSubCreateSubscriptionOperator", + "airflow.contrib.operators.pubsub_operator.PubSubSubscriptionCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.pubsub.PubSubDeleteSubscriptionOperator", + "airflow.contrib.operators.pubsub_operator.PubSubSubscriptionDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.pubsub.PubSubCreateTopicOperator", + "airflow.contrib.operators.pubsub_operator.PubSubTopicCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.pubsub.PubSubDeleteTopicOperator", + "airflow.contrib.operators.pubsub_operator.PubSubTopicDeleteOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocCreateClusterOperator", + "airflow.contrib.operators.dataproc_operator.DataprocClusterCreateOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocDeleteClusterOperator", + "airflow.contrib.operators.dataproc_operator.DataprocClusterDeleteOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocScaleClusterOperator", + "airflow.contrib.operators.dataproc_operator.DataprocClusterScaleOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocSubmitHadoopJobOperator", + "airflow.contrib.operators.dataproc_operator.DataProcHadoopOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocSubmitHiveJobOperator", + "airflow.contrib.operators.dataproc_operator.DataProcHiveOperator", + ), + ( + "airflow.providers.google.cloud." "operators.dataproc.DataprocJobBaseOperator", + "airflow.contrib.operators.dataproc_operator.DataProcJobBaseOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocSubmitPigJobOperator", + "airflow.contrib.operators.dataproc_operator.DataProcPigOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocSubmitPySparkJobOperator", + "airflow.contrib.operators.dataproc_operator.DataProcPySparkOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocSubmitSparkJobOperator", + "airflow.contrib.operators.dataproc_operator.DataProcSparkOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocSubmitSparkSqlJobOperator", + "airflow.contrib.operators.dataproc_operator.DataProcSparkSqlOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocInstantiateInlineWorkflowTemplateOperator", + "airflow.contrib.operators.dataproc_operator." 
+ "DataprocWorkflowTemplateInstantiateInlineOperator", + ), + ( + "airflow.providers.google.cloud." + "operators.dataproc.DataprocInstantiateWorkflowTemplateOperator", + "airflow.contrib.operators.dataproc_operator." + "DataprocWorkflowTemplateInstantiateOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryCreateEmptyDatasetOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryCreateEmptyDatasetOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryCreateEmptyTableOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryCreateEmptyTableOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryCreateExternalTableOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryCreateExternalTableOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryDeleteDatasetOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryDeleteDatasetOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryGetDatasetOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryGetDatasetOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryGetDatasetTablesOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryGetDatasetTablesOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryPatchDatasetOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryPatchDatasetOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryUpdateDatasetOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryUpdateDatasetOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryUpsertTableOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryUpsertTableOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryCheckOperator", + "airflow.contrib.operators.bigquery_check_operator.BigQueryCheckOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryIntervalCheckOperator", + "airflow.contrib.operators.bigquery_check_operator.BigQueryIntervalCheckOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryValueCheckOperator", + "airflow.contrib.operators.bigquery_check_operator.BigQueryValueCheckOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryGetDataOperator", + "airflow.contrib.operators.bigquery_get_data.BigQueryGetDataOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryExecuteQueryOperator", + "airflow.contrib.operators.bigquery_operator.BigQueryOperator", + ), + ( + "airflow.providers.google.cloud.operators.bigquery.BigQueryDeleteTableOperator", + "airflow.contrib.operators.bigquery_table_delete_operator.BigQueryTableDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.gcs.GCSBucketCreateAclEntryOperator", + "airflow.contrib.operators.gcs_acl_operator.GoogleCloudStorageBucketCreateAclEntryOperator", + ), + ( + "airflow.providers.google.cloud.operators.gcs.GCSObjectCreateAclEntryOperator", + "airflow.contrib.operators.gcs_acl_operator.GoogleCloudStorageObjectCreateAclEntryOperator", + ), + ( + "airflow.providers.google.cloud.operators.gcs.GCSDeleteObjectsOperator", + "airflow.contrib.operators.gcs_delete_operator.GoogleCloudStorageDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.gcs.GCSListObjectsOperator", + 
"airflow.contrib.operators.gcs_list_operator.GoogleCloudStorageListOperator", + ), + ( + "airflow.providers.google.cloud.operators.gcs.GCSCreateBucketOperator", + "airflow.contrib.operators.gcs_operator.GoogleCloudStorageCreateBucketOperator", + ), + ( + "airflow.providers.amazon.aws.operators.athena.AWSAthenaOperator", + "airflow.contrib.operators.aws_athena_operator.AWSAthenaOperator", + ), + ( + "airflow.providers.amazon.aws.operators.batch.AwsBatchOperator", + "airflow.contrib.operators.awsbatch_operator.AWSBatchOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sqs.SQSPublishOperator", + "airflow.contrib.operators.aws_sqs_publish_operator.SQSPublishOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sns.SnsPublishOperator", + "airflow.contrib.operators.sns_publish_operator.SnsPublishOperator", + ), + ( + "airflow.providers.apache.druid.operators.druid.DruidOperator", + "airflow.contrib.operators.druid_operator.DruidOperator", + ), + ( + "airflow.providers.apache.spark.operators.spark_jdbc.SparkSubmitOperator", + "airflow.contrib.operators.spark_jdbc_operator.SparkSubmitOperator", + ), + ( + "airflow.providers.apache.spark.operators.spark_sql.SparkSqlOperator", + "airflow.contrib.operators.spark_sql_operator.SparkSqlOperator", + ), + ( + "airflow.providers.apache.spark.operators.spark_submit.SparkSubmitOperator", + "airflow.contrib.operators.spark_submit_operator.SparkSubmitOperator", + ), + ( + "airflow.providers.apache.spark.operators.spark_jdbc.SparkJDBCOperator", + "airflow.contrib.operators.spark_jdbc_operator.SparkJDBCOperator", + ), + ( + "airflow.providers.apache.sqoop.operators.sqoop.SqoopOperator", + "airflow.contrib.operators.sqoop_operator.SqoopOperator", + ), + ( + "airflow.providers.apache.druid.operators.druid_check.DruidCheckOperator", + "airflow.operators.druid_check_operator.DruidCheckOperator", + ), + ( + "airflow.providers.apache.hive.operators.hive.HiveOperator", + "airflow.operators.hive_operator.HiveOperator", + ), + ( + "airflow.providers.apache.hive.operators.hive_stats.HiveStatsCollectionOperator", + "airflow.operators.hive_stats_operator.HiveStatsCollectionOperator", + ), + ( + "airflow.providers.apache.pig.operators.pig.PigOperator", + "airflow.operators.pig_operator.PigOperator", + ), + ( + "airflow.providers.microsoft.azure.operators.adls_list.AzureDataLakeStorageListOperator", + "airflow.contrib.operators.adls_list_operator.AzureDataLakeStorageListOperator", + ), + ( + "airflow.providers.microsoft.azure.operators" + ".azure_container_instances.AzureContainerInstancesOperator", + "airflow.contrib.operators.azure_container_instances_operator.AzureContainerInstancesOperator", + ), + ( + "airflow.providers.microsoft.azure.operators.azure_cosmos.AzureCosmosInsertDocumentOperator", + "airflow.contrib.operators.azure_cosmos_operator.AzureCosmosInsertDocumentOperator", + ), + ( + "airflow.providers.microsoft.azure.operators.wasb_delete_blob.WasbDeleteBlobOperator", + "airflow.contrib.operators.wasb_delete_blob_operator.WasbDeleteBlobOperator", + ), + ( + "airflow.providers.amazon.aws.operators.ecs.ECSOperator", + "airflow.contrib.operators.ecs_operator.ECSOperator", + ), + ( + "airflow.providers.amazon.aws.operators.emr_add_steps.EmrAddStepsOperator", + "airflow.contrib.operators.emr_add_steps_operator.EmrAddStepsOperator", + ), + ( + "airflow.providers.amazon.aws.operators.emr_create_job_flow.EmrCreateJobFlowOperator", + "airflow.contrib.operators.emr_create_job_flow_operator.EmrCreateJobFlowOperator", + ), + ( + 
"airflow.providers.amazon.aws.operators.emr_terminate_job_flow.EmrTerminateJobFlowOperator", + "airflow.contrib.operators.emr_terminate_job_flow_operator.EmrTerminateJobFlowOperator", + ), + ( + "airflow.providers.amazon.aws.operators.s3_copy_object.S3CopyObjectOperator", + "airflow.contrib.operators.s3_copy_object_operator.S3CopyObjectOperator", + ), + ( + "airflow.providers.amazon.aws.operators.s3_delete_objects.S3DeleteObjectsOperator", + "airflow.contrib.operators.s3_delete_objects_operator.S3DeleteObjectsOperator", + ), + ( + "airflow.providers.amazon.aws.operators.s3_list.S3ListOperator", + "airflow.contrib.operators.s3_list_operator.S3ListOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sagemaker_base.SageMakerBaseOperator", + "airflow.contrib.operators.sagemaker_base_operator.SageMakerBaseOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sagemaker_endpoint_config.SageMakerEndpointConfigOperator", + "airflow.contrib.operators.sagemaker_endpoint_config_operator.SageMakerEndpointConfigOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sagemaker_endpoint.SageMakerEndpointOperator", + "airflow.contrib.operators.sagemaker_endpoint_operator.SageMakerEndpointOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sagemaker_model.SageMakerModelOperator", + "airflow.contrib.operators.sagemaker_model_operator.SageMakerModelOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sagemaker_training.SageMakerTrainingOperator", + "airflow.contrib.operators.sagemaker_training_operator.SageMakerTrainingOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sagemaker_transform.SageMakerTransformOperator", + "airflow.contrib.operators.sagemaker_transform_operator.SageMakerTransformOperator", + ), + ( + "airflow.providers.amazon.aws.operators.sagemaker_tuning.SageMakerTuningOperator", + "airflow.contrib.operators.sagemaker_tuning_operator.SageMakerTuningOperator", + ), + ( + "airflow.providers.docker.operators.docker_swarm.DockerSwarmOperator", + "airflow.contrib.operators.docker_swarm_operator.DockerSwarmOperator", + ), + ( + "airflow.providers.cncf.kubernetes.operators.kubernetes_pod.KubernetesPodOperator", + "airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator", + ), + ( + "airflow.providers.redis.operators.redis_publish.RedisPublishOperator", + "airflow.contrib.operators.redis_publish_operator.RedisPublishOperator", + ), + ( + "airflow.operators.bash.BashOperator", + "airflow.operators.bash_operator.BashOperator", + ), + ( + "airflow.providers.docker.operators.docker.DockerOperator", + "airflow.operators.docker_operator.DockerOperator", + ), + ( + "airflow.providers.microsoft.mssql.operators.mssql.MsSqlOperator", + "airflow.operators.mssql_operator.MsSqlOperator", + ), + ( + "airflow.providers.mysql.operators.mysql.MySqlOperator", + "airflow.operators.mysql_operator.MySqlOperator", + ), + ( + "airflow.providers.oracle.operators.oracle.OracleOperator", + "airflow.operators.oracle_operator.OracleOperator", + ), + ( + "airflow.providers.papermill.operators.papermill.PapermillOperator", + "airflow.operators.papermill_operator.PapermillOperator", + ), + ( + "airflow.operators.sql.SQLCheckOperator", + "airflow.operators.presto_check_operator.PrestoCheckOperator", + ), + ( + "airflow.operators.sql.SQLIntervalCheckOperator", + "airflow.operators.presto_check_operator.PrestoIntervalCheckOperator", + ), + ( + "airflow.operators.sql.SQLValueCheckOperator", + "airflow.operators.presto_check_operator.PrestoValueCheckOperator", + ), + ( 
+ "airflow.operators.sql.SQLCheckOperator", + "airflow.operators.check_operator.CheckOperator", + ), + ( + "airflow.operators.sql.SQLIntervalCheckOperator", + "airflow.operators.check_operator.IntervalCheckOperator", + ), + ( + "airflow.operators.sql.SQLValueCheckOperator", + "airflow.operators.check_operator.ValueCheckOperator", + ), + ( + "airflow.operators.sql.SQLThresholdCheckOperator", + "airflow.operators.check_operator.ThresholdCheckOperator", + ), + ( + "airflow.operators.sql.BranchSQLOperator", + "airflow.operators.sql_branch_operator.BranchSqlOperator", + ), + ( + "airflow.operators.python.BranchPythonOperator", + "airflow.operators.python_operator.BranchPythonOperator", + ), + ( + "airflow.operators.python.PythonOperator", + "airflow.operators.python_operator.PythonOperator", + ), + ( + "airflow.operators.python.ShortCircuitOperator", + "airflow.operators.python_operator.ShortCircuitOperator", + ), + ( + "airflow.operators.python.PythonVirtualenvOperator", + "airflow.operators.python_operator.PythonVirtualenvOperator", + ), + ( + "airflow.providers.sqlite.operators.sqlite.SqliteOperator", + "airflow.operators.sqlite_operator.SqliteOperator", + ), + ( + "airflow.providers.databricks.operators.databricks.DatabricksRunNowOperator", + "airflow.contrib.operators.databricks_operator.DatabricksRunNowOperator", + ), + ( + "airflow.providers.databricks.operators.databricks.DatabricksSubmitRunOperator", + "airflow.contrib.operators.databricks_operator.DatabricksSubmitRunOperator", + ), + ( + "airflow.providers.dingding.operators.dingding.DingdingOperator", + "airflow.contrib.operators.dingding_operator.DingdingOperator", + ), + ( + "airflow.providers.discord.operators.discord_webhook.DiscordWebhookOperator", + "airflow.contrib.operators.discord_webhook_operator.DiscordWebhookOperator", + ), + ( + "airflow.providers.jenkins.operators.jenkins_job_trigger.JenkinsJobTriggerOperator", + "airflow.contrib.operators.jenkins_job_trigger_operator.JenkinsJobTriggerOperator", + ), + ( + "airflow.providers.opsgenie.operators.opsgenie_alert.OpsgenieAlertOperator", + "airflow.contrib.operators.opsgenie_alert_operator.OpsgenieAlertOperator", + ), + ( + "airflow.providers.qubole.operators.qubole_check.QuboleCheckOperator", + "airflow.contrib.operators.qubole_check_operator.QuboleCheckOperator", + ), + ( + "airflow.providers.qubole.operators.qubole_check.QuboleValueCheckOperator", + "airflow.contrib.operators.qubole_check_operator.QuboleValueCheckOperator", + ), + ( + "airflow.providers.qubole.operators.qubole.QuboleOperator", + "airflow.contrib.operators.qubole_operator.QuboleOperator", + ), + ( + "airflow.providers.segment.operators.segment_track_event.SegmentTrackEventOperator", + "airflow.contrib.operators.segment_track_event_operator.SegmentTrackEventOperator", + ), + ( + "airflow.providers.slack.operators.slack_webhook.SlackWebhookOperator", + "airflow.contrib.operators.slack_webhook_operator.SlackWebhookOperator", + ), + ( + "airflow.providers.vertica.operators.vertica.VerticaOperator", + "airflow.contrib.operators.vertica_operator.VerticaOperator", + ), + ( + "airflow.providers.slack.operators.slack.SlackAPIPostOperator", + "airflow.operators.slack_operator.SlackAPIPostOperator", + ), + ( + "airflow.providers.slack.operators.slack.SlackAPIOperator", + "airflow.operators.slack_operator.SlackAPIOperator", + ), + ( + "airflow.providers.grpc.operators.grpc.GrpcOperator", + "airflow.contrib.operators.grpc_operator.GrpcOperator", + ), + ( + "airflow.providers.ssh.operators.ssh.SSHOperator", + 
"airflow.contrib.operators.ssh_operator.SSHOperator", + ), + ( + "airflow.providers.microsoft.winrm.operators.winrm.WinRMOperator", + "airflow.contrib.operators.winrm_operator.WinRMOperator", + ), + ( + "airflow.operators.email.EmailOperator", + "airflow.operators.email_operator.EmailOperator", + ), + ( + "airflow.providers.http.operators.http.SimpleHttpOperator", + "airflow.operators.http_operator.SimpleHttpOperator", + ), + ( + "airflow.providers.jdbc.operators.jdbc.JdbcOperator", + "airflow.operators.jdbc_operator.JdbcOperator", + ), + ( + "airflow.providers.sftp.operators.sftp.SFTPOperator", + "airflow.contrib.operators.sftp_operator.SFTPOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLBaseOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlBaseOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLCreateInstanceDatabaseOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlInstanceDatabaseCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLCreateInstanceOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlInstanceCreateOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLDeleteInstanceDatabaseOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlInstanceDatabaseDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLDeleteInstanceOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlInstanceDeleteOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLExecuteQueryOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlQueryOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLExportInstanceOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlInstanceExportOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLImportInstanceOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlInstanceImportOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLInstancePatchOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlInstancePatchOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_sql.CloudSQLPatchInstanceDatabaseOperator", + "airflow.contrib.operators.gcp_sql_operator.CloudSqlInstanceDatabasePatchOperator", + ), + ( + "airflow.providers.jira.operators.jira.JiraOperator", + "airflow.contrib.operators.jira_operator.JiraOperator", + ), + ( + "airflow.providers.postgres.operators.postgres.PostgresOperator", + "airflow.operators.postgres_operator.PostgresOperator", + ), + ( + "airflow.providers.google.cloud.operators.speech_to_text.CloudSpeechToTextRecognizeSpeechOperator", + "airflow.contrib.operators.gcp_speech_to_text_operator.GcpSpeechToTextRecognizeSpeechOperator", + ), + ( + "airflow.providers.google.cloud.operators.text_to_speech.CloudTextToSpeechSynthesizeOperator", + "airflow.contrib.operators.gcp_text_to_speech_operator.GcpTextToSpeechSynthesizeOperator", + ), +] + +SECRETS = [ + ( + "airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend", + "airflow.contrib.secrets.aws_secrets_manager.SecretsManagerBackend", + ), + ( + "airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend", + "airflow.contrib.secrets.aws_systems_manager.SystemsManagerParameterStoreBackend", + ), + ( + "airflow.providers.google.cloud.secrets.secret_manager.CloudSecretManagerBackend", + 
"airflow.contrib.secrets.gcp_secrets_manager.CloudSecretsManagerBackend", + ), + ( + "airflow.providers.hashicorp.secrets.vault.VaultBackend", + "airflow.contrib.secrets.hashicorp_vault.VaultBackend", + ), +] + +SENSORS = [ + ( + "airflow.providers.apache.cassandra.sensors.record.CassandraRecordSensor", + "airflow.contrib.sensors.cassandra_record_sensor.CassandraRecordSensor", + ), + ( + "airflow.providers.apache.cassandra.sensors.table.CassandraTableSensor", + "airflow.contrib.sensors.cassandra_table_sensor.CassandraTableSensor", + ), + ( + "airflow.providers.datadog.sensors.datadog.DatadogSensor", + "airflow.contrib.sensors.datadog_sensor.DatadogSensor", + ), + ( + "airflow.providers.google.cloud.sensors.bigtable.BigtableTableReplicationCompletedSensor", + "airflow.contrib.operators.gcp_bigtable_operator." + "BigtableTableWaitForReplicationSensor", + ), + ( + "airflow.providers.google.cloud.sensors.cloud_storage_transfer_service." + "CloudDataTransferServiceJobStatusSensor", + "airflow.contrib.sensors.gcp_transfer_sensor." + "GCPTransferServiceWaitForJobStatusSensor", + ), + ( + "airflow.providers.google.cloud.sensors.pubsub.PubSubPullSensor", + "airflow.contrib.sensors.pubsub_sensor.PubSubPullSensor", + ), + ( + "airflow.providers.google.cloud.sensors.bigquery.BigQueryTableExistenceSensor", + "airflow.contrib.sensors.bigquery_sensor.BigQueryTableSensor", + ), + ( + "airflow.providers.google.cloud.sensors.gcs.GCSObjectExistenceSensor", + "airflow.contrib.sensors.gcs_sensor.GoogleCloudStorageObjectSensor", + ), + ( + "airflow.providers.google.cloud.sensors.gcs.GCSObjectUpdateSensor", + "airflow.contrib.sensors.gcs_sensor.GoogleCloudStorageObjectUpdatedSensor", + ), + ( + "airflow.providers.google.cloud.sensors.gcs.GCSObjectsWtihPrefixExistenceSensor", + "airflow.contrib.sensors.gcs_sensor.GoogleCloudStoragePrefixSensor", + ), + ( + "airflow.providers.google.cloud.sensors.gcs.GCSUploadSessionCompleteSensor", + "airflow.contrib.sensors.gcs_sensor.GoogleCloudStorageUploadSessionCompleteSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.athena.AthenaSensor", + "airflow.contrib.sensors.aws_athena_sensor.AthenaSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.sqs.SQSSensor", + "airflow.contrib.sensors.aws_sqs_sensor.SQSSensor", + ), + ( + "airflow.providers.apache.hdfs.sensors.hdfs.HdfsFolderSensor", + "airflow.contrib.sensors.hdfs_sensor.HdfsSensorFolder", + ), + ( + "airflow.providers.apache.hdfs.sensors.hdfs.HdfsRegexSensor", + "airflow.contrib.sensors.hdfs_sensor.HdfsSensorRegex", + ), + ( + "airflow.providers.apache.hive.sensors.hive_partition.HivePartitionSensor", + "airflow.sensors.hive_partition_sensor.HivePartitionSensor", + ), + ( + "airflow.providers.apache.hive.sensors.metastore_partition.MetastorePartitionSensor", + "airflow.sensors.metastore_partition_sensor.MetastorePartitionSensor", + ), + ( + "airflow.providers.apache.hive.sensors.named_hive_partition.NamedHivePartitionSensor", + "airflow.sensors.named_hive_partition_sensor.NamedHivePartitionSensor", + ), + ( + "airflow.providers.apache.hdfs.sensors.web_hdfs.WebHdfsSensor", + "airflow.sensors.web_hdfs_sensor.WebHdfsSensor", + ), + ( + "airflow.providers.apache.hdfs.sensors.hdfs.HdfsSensor", + "airflow.sensors.hdfs_sensor.HdfsSensor", + ), + ( + "airflow.sensors.weekday_sensor.DayOfWeekSensor", + "airflow.contrib.sensors.weekday_sensor.DayOfWeekSensor", + ), + ( + "airflow.sensors.filesystem.FileSensor", + "airflow.contrib.sensors.file_sensor.FileSensor", + ), + ( + 
"airflow.providers.microsoft.azure.sensors.wasb.WasbBlobSensor", + "airflow.contrib.sensors.wasb_sensor.WasbBlobSensor", + ), + ( + "airflow.providers.microsoft.azure.sensors.wasb.WasbPrefixSensor", + "airflow.contrib.sensors.wasb_sensor.WasbPrefixSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.glue_catalog_partition.AwsGlueCatalogPartitionSensor", + "airflow.contrib.sensors.aws_glue_catalog_partition_sensor.AwsGlueCatalogPartitionSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.emr_base.EmrBaseSensor", + "airflow.contrib.sensors.emr_base_sensor.EmrBaseSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.emr_job_flow.EmrJobFlowSensor", + "airflow.contrib.sensors.emr_job_flow_sensor.EmrJobFlowSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.emr_step.EmrStepSensor", + "airflow.contrib.sensors.emr_step_sensor.EmrStepSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.sagemaker_base.SageMakerBaseSensor", + "airflow.contrib.sensors.sagemaker_base_sensor.SageMakerBaseSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.sagemaker_endpoint.SageMakerEndpointSensor", + "airflow.contrib.sensors.sagemaker_endpoint_sensor.SageMakerEndpointSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.sagemaker_transform.SageMakerTransformSensor", + "airflow.contrib.sensors.sagemaker_transform_sensor.SageMakerTransformSensor", + ), + ( + "airflow.providers.amazon.aws.sensors.sagemaker_tuning.SageMakerTuningSensor", + "airflow.contrib.sensors.sagemaker_tuning_sensor.SageMakerTuningSensor", + ), + ( + "airflow.providers.amazon.aws.operators.s3_file_transform.S3FileTransformOperator", + "airflow.operators.s3_file_transform_operator.S3FileTransformOperator", + ), + ( + "airflow.providers.amazon.aws.sensors.s3_key.S3KeySensor", + "airflow.sensors.s3_key_sensor.S3KeySensor", + ), + ( + "airflow.providers.amazon.aws.sensors.s3_prefix.S3PrefixSensor", + "airflow.sensors.s3_prefix_sensor.S3PrefixSensor", + ), + ( + "airflow.sensors.bash.BashSensor", + "airflow.contrib.sensors.bash_sensor.BashSensor", + ), + ( + "airflow.providers.celery.sensors.celery_queue.CeleryQueueSensor", + "airflow.contrib.sensors.celery_queue_sensor.CeleryQueueSensor", + ), + ( + "airflow.providers.mongo.sensors.mongo.MongoSensor", + "airflow.contrib.sensors.mongo_sensor.MongoSensor", + ), + ( + "airflow.sensors.python.PythonSensor", + "airflow.contrib.sensors.python_sensor.PythonSensor", + ), + ( + "airflow.providers.redis.sensors.redis_key.RedisKeySensor", + "airflow.contrib.sensors.redis_key_sensor.RedisKeySensor", + ), + ( + "airflow.providers.redis.sensors.redis_pub_sub.RedisPubSubSensor", + "airflow.contrib.sensors.redis_pub_sub_sensor.RedisPubSubSensor", + ), + ( + "airflow.providers.datadog.sensors.datadog.DatadogSensor", + "airflow.contrib.sensors.datadog_sensor.DatadogSensor", + ), + ( + "airflow.providers.qubole.sensors.qubole.QuboleSensor", + "airflow.contrib.sensors.qubole_sensor.QuboleSensor", + ), + ( + "airflow.providers.qubole.sensors.qubole.QubolePartitionSensor", + "airflow.contrib.sensors.qubole_sensor.QubolePartitionSensor", + ), + ( + "airflow.providers.qubole.sensors.qubole.QuboleFileSensor", + "airflow.contrib.sensors.qubole_sensor.QuboleFileSensor", + ), + ( + "airflow.providers.ftp.sensors.ftp.FTPSensor", + "airflow.contrib.sensors.ftp_sensor.FTPSensor", + ), + ( + "airflow.providers.ftp.sensors.ftp.FTPSSensor", + "airflow.contrib.sensors.ftp_sensor.FTPSSensor", + ), + ( + "airflow.providers.imap.sensors.imap_attachment.ImapAttachmentSensor", + 
"airflow.contrib.sensors.imap_attachment_sensor.ImapAttachmentSensor", + ), + ( + "airflow.providers.jira.sensors.jira.JiraSensor", + "airflow.contrib.sensors.jira_sensor.JiraSensor", + ), + ( + "airflow.providers.jira.sensors.jira.JiraTicketSensor", + "airflow.contrib.sensors.jira_sensor.JiraTicketSensor", + ), + ( + "airflow.providers.http.sensors.http.HttpSensor", + "airflow.sensors.http_sensor.HttpSensor", + ), + ( + "airflow.providers.sftp.sensors.sftp.SFTPSensor", + "airflow.contrib.sensors.sftp_sensor.SFTPSensor", + ), +] + +TRANSFERS = [ + ( + "airflow.providers.google.cloud.transfers.local_to_gcs.LocalFilesystemToGCSOperator", + "airflow.contrib.operators.file_to_gcs.FileToGoogleCloudStorageOperator", + ), + ( + "airflow.providers.google.cloud.transfers.adls_to_gcs.ADLSToGCSOperator", + "airflow.contrib.operators.adls_to_gcs.AdlsToGoogleCloudStorageOperator", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service." + "CloudDataTransferServiceGCSToGCSOperator", + "airflow.contrib.operators.gcp_transfer_operator." + "GoogleCloudStorageToGoogleCloudStorageTransferOperator", + ), + ( + "airflow.providers.google.cloud.operators.vision.CloudVisionAddProductToProductSetOperator", + "airflow.contrib.operators.gcp_vision_operator.CloudVisionAddProductToProductSetOperator", + ), + ( + "airflow.providers.google.cloud.transfers.gcs_to_bigquery.GCSToBigQueryOperator", + "airflow.contrib.operators.gcs_to_bq.GoogleCloudStorageToBigQueryOperator", + ), + ( + "airflow.providers.google.cloud.transfers.gcs_to_gcs.GCSToGCSOperator", + "airflow.contrib.operators.gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator", + ), + ( + "airflow.providers.amazon.aws.transfers.gcs_to_s3.GCSToS3Operator", + "airflow.contrib.operators.gcs_to_s3.GoogleCloudStorageToS3Operator", + ), + ( + "airflow.providers.google.cloud.transfers.mssql_to_gcs.MSSQLToGCSOperator", + "airflow.contrib.operators.mssql_to_gcs.MsSqlToGoogleCloudStorageOperator", + ), + ( + "airflow.providers.google.cloud.transfers.mysql_to_gcs.MySQLToGCSOperator", + "airflow.contrib.operators.mysql_to_gcs.MySqlToGoogleCloudStorageOperator", + ), + ( + "airflow.providers.google.cloud.transfers.postgres_to_gcs.PostgresToGCSOperator", + "airflow.contrib.operators.postgres_to_gcs_operator." 
+ "PostgresToGoogleCloudStorageOperator", + ), + ( + "airflow.providers.google.cloud.transfers.bigquery_to_bigquery.BigQueryToBigQueryOperator", + "airflow.contrib.operators.bigquery_to_bigquery.BigQueryToBigQueryOperator", + ), + ( + "airflow.providers.google.cloud.transfers.bigquery_to_gcs.BigQueryToGCSOperator", + "airflow.contrib.operators.bigquery_to_gcs.BigQueryToCloudStorageOperator", + ), + ( + "airflow.providers.google.cloud.transfers.bigquery_to_mysql.BigQueryToMySqlOperator", + "airflow.contrib.operators.bigquery_to_mysql_operator.BigQueryToMySqlOperator", + ), + ( + "airflow.providers.google.cloud.transfers.sql_to_gcs.BaseSQLToGCSOperator", + "airflow.contrib.operators.sql_to_gcs.BaseSQLToGoogleCloudStorageOperator", + ), + ( + "airflow.providers.amazon.aws.transfers.dynamodb_to_s3.DynamoDBToS3Operator", + "airflow.contrib.operators.dynamodb_to_s3.DynamoDBToS3Operator", + ), + ( + "airflow.providers.amazon.aws.transfers.hive_to_dynamodb.HiveToDynamoDBOperator", + "airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBOperator", + ), + ( + "airflow.providers.amazon.aws.transfers.imap_attachment_to_s3.ImapAttachmentToS3Operator", + "airflow.contrib.operators.imap_attachment_to_s3_operator.ImapAttachmentToS3Operator", + ), + ( + "airflow.providers.amazon.aws.transfers.mongo_to_s3.MongoToS3Operator", + "airflow.contrib.operators.mongo_to_s3.MongoToS3Operator", + ), + ( + "airflow.providers.amazon.aws.transfers.s3_to_sftp.S3ToSFTPOperator", + "airflow.contrib.operators.s3_to_sftp_operator.S3ToSFTPOperator", + ), + ( + "airflow.providers.amazon.aws.transfers.sftp_to_s3.SFTPToS3Operator", + "airflow.contrib.operators.sftp_to_s3_operator.SFTPToS3Operator", + ), + ( + "airflow.providers.amazon.aws.transfers.gcs_to_s3.GCSToS3Operator", + "airflow.operators.gcs_to_s3.GCSToS3Operator", + ), + ( + "airflow.providers.amazon.aws.transfers.google_api_to_s3.GoogleApiToS3Operator", + "airflow.operators.google_api_to_s3_transfer.GoogleApiToS3Transfer", + ), + ( + "airflow.providers.amazon.aws.transfers.redshift_to_s3.RedshiftToS3Operator", + "airflow.operators.redshift_to_s3_operator.RedshiftToS3Transfer", + ), + ( + "airflow.providers.amazon.aws.transfers.s3_to_redshift.S3ToRedshiftOperator", + "airflow.operators.s3_to_redshift_operator.S3ToRedshiftTransfer", + ), + ( + "airflow.providers.apache.hive.transfers.vertica_to_hive.VerticaToHiveOperator", + "airflow.contrib.operators.vertica_to_hive.VerticaToHiveTransfer", + ), + ( + "airflow.providers.apache.druid.transfers.hive_to_druid.HiveToDruidOperator", + "airflow.operators.hive_to_druid.HiveToDruidTransfer", + ), + ( + "airflow.providers.apache.hive.transfers.hive_to_mysql.HiveToMySqlOperator", + "airflow.operators.hive_to_mysql.HiveToMySqlTransfer", + ), + ( + "airflow.providers.apache.hive.transfers.mysql_to_hive.MySqlToHiveOperator", + "airflow.operators.mysql_to_hive.MySqlToHiveTransfer", + ), + ( + "airflow.providers.apache.hive.transfers.s3_to_hive.S3ToHiveOperator", + "airflow.operators.s3_to_hive_operator.S3ToHiveTransfer", + ), + ( + "airflow.providers.apache.hive.transfers.hive_to_samba.HiveToSambaOperator", + "airflow.operators.hive_to_samba_operator.HiveToSambaOperator", + ), + ( + "airflow.providers.apache.hive.transfers.mssql_to_hive.MsSqlToHiveOperator", + "airflow.operators.mssql_to_hive.MsSqlToHiveTransfer", + ), + ( + "airflow.providers.microsoft.azure.transfers.file_to_wasb.FileToWasbOperator", + "airflow.contrib.operators.file_to_wasb.FileToWasbOperator", + ), + ( + 
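+        # Note: several of the pre-2.0 transfer classes above were suffixed "Transfer"
+        # (e.g. RedshiftToS3Transfer); their providers-package replacements consistently
+        # use the "*Operator" suffix instead.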
"airflow.providers.google.suite.transfers.gcs_to_gdrive.GCSToGoogleDriveOperator", + "airflow.contrib.operators.gcs_to_gdrive_operator.GCSToGoogleDriveOperator", + ), + ( + "airflow.providers.microsoft.azure.transfers.oracle_to_azure_data_lake" + ".OracleToAzureDataLakeOperator", + "airflow.contrib.operators.oracle_to_azure_data_lake_transfer.OracleToAzureDataLakeOperator", + ), + ( + "airflow.providers.oracle.transfers.oracle_to_oracle.OracleToOracleOperator", + "airflow.contrib.operators.oracle_to_oracle_transfer.OracleToOracleTransfer", + ), + ( + "airflow.providers.google.cloud.transfers.s3_to_gcs.S3ToGCSOperator", + "airflow.contrib.operators.s3_to_gcs_operator.S3ToGCSOperator", + ), + ( + "airflow.providers.mysql.transfers.vertica_to_mysql.VerticaToMySqlOperator", + "airflow.contrib.operators.vertica_to_mysql.VerticaToMySqlTransfer", + ), + ( + "airflow.providers.mysql.transfers.presto_to_mysql.PrestoToMySqlOperator", + "airflow.operators.presto_to_mysql.PrestoToMySqlTransfer", + ), + ( + "airflow.providers.google.cloud.operators.cloud_storage_transfer_service" + ".CloudDataTransferServiceS3ToGCSOperator", + "airflow.contrib.operators.s3_to_gcs_transfer_operator.CloudDataTransferServiceS3ToGCSOperator", + ), + ( + "airflow.providers.google.cloud.transfers.cassandra_to_gcs.CassandraToGCSOperator", + "airflow.contrib.operators.cassandra_to_gcs.CassandraToGoogleCloudStorageOperator", + ), +] + +UTILS = [ + ( + 'airflow.providers.sendgrid.utils.emailer.send_email', + 'airflow.contrib.utils.sendgrid.send_email' + ) +] + +ALL = OPERATORS + HOOKS + SECRETS + SENSORS + TRANSFERS + UTILS diff --git a/airflow/upgrade/rules/send_grid_moved.py b/airflow/upgrade/rules/send_grid_moved.py new file mode 100644 index 0000000000000..0bab17e9cf97e --- /dev/null +++ b/airflow/upgrade/rules/send_grid_moved.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import + +from airflow.configuration import conf +from airflow.upgrade.rules.base_rule import BaseRule + + +class SendGridEmailerMovedRule(BaseRule): + title = "SendGrid email uses old airflow.contrib module" + + description = """ +The SendGrid module `airflow.contrib.utils.sendgrid` was moved to `airflow.providers.sendgrid.utils.emailer`. + """ + + def check(self): + email_conf = conf.get(section="email", key="email_backend") + email_contrib_path = "airflow.contrib.utils.sendgrid" + if email_contrib_path in email_conf: + email_provider_path = "airflow.providers.sendgrid.utils.emailer" + msg = "Email backend option uses airflow.contrib Sendgrid module. 
" \ + + "Please use new module: {}".format(email_provider_path) + return [msg] diff --git a/airflow/upgrade/rules/spark_jdbc_operator_conn_id_rule.py b/airflow/upgrade/rules/spark_jdbc_operator_conn_id_rule.py new file mode 100644 index 0000000000000..f439408da3abd --- /dev/null +++ b/airflow/upgrade/rules/spark_jdbc_operator_conn_id_rule.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.models import Connection +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.utils.db import provide_session + + +class SparkJDBCOperatorConnIdRule(BaseRule): + title = "Check Spark JDBC Operator default connection name" + + description = """\ +In Airflow 1.10.x, the default value for SparkJDBCOperator class 'conn_id' is 'spark-default'. +From Airflow 2.0, it has been changed to 'spark_default' to conform with the naming conventions +of all other connection names. + """ + + @provide_session + def check(self, session=None): + for conn in session.query(Connection.conn_id): + if conn.conn_id == 'spark-default': + return ( + "Deprecation Warning: From Airflow 2.0, the default value of 'conn_id' argument of " + "SparkJDBCOperator class has been changed to 'spark_default' to conform with the naming " + "conventions of all other connection names. Please rename the connection with " + "id 'spark-default' to 'spark_default' or explicitly pass 'spark-default' " + "to the operator. See the link below for details: " + "https://github.com/apache/airflow/blob/2.0.0/" + "UPDATING.md#sparkjdbchook-default-connection" + ) diff --git a/airflow/upgrade/rules/task_handlers_moved.py b/airflow/upgrade/rules/task_handlers_moved.py new file mode 100644 index 0000000000000..e813221b3abd8 --- /dev/null +++ b/airflow/upgrade/rules/task_handlers_moved.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from airflow import conf +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.utils.module_loading import import_string + +LOGS = [ + ( + "airflow.providers.amazon.aws.log.s3_task_handler.S3TaskHandler", + "airflow.utils.log.s3_task_handler.S3TaskHandler" + ), + ( + 'airflow.providers.amazon.aws.log.cloudwatch_task_handler.CloudwatchTaskHandler', + 'airflow.utils.log.cloudwatch_task_handler.CloudwatchTaskHandler' + ), + ( + 'airflow.providers.elasticsearch.log.es_task_handler.ElasticsearchTaskHandler', + 'airflow.utils.log.es_task_handler.ElasticsearchTaskHandler' + ), + ( + "airflow.providers.google.cloud.log.stackdriver_task_handler.StackdriverTaskHandler", + "airflow.utils.log.stackdriver_task_handler.StackdriverTaskHandler" + ), + ( + "airflow.providers.google.cloud.log.gcs_task_handler.GCSTaskHandler", + "airflow.utils.log.gcs_task_handler.GCSTaskHandler" + ), + ( + "airflow.providers.microsoft.azure.log.wasb_task_handler.WasbTaskHandler", + "airflow.utils.log.wasb_task_handler.WasbTaskHandler" + ) +] + + +class TaskHandlersMovedRule(BaseRule): + title = "Changes in import path of remote task handlers" + description = ( + "The remote log task handlers have been moved to the providers " + "directory and into their respective providers packages." + ) + + def check(self): + logging_class = conf.get("core", "logging_config_class", fallback=None) + if logging_class: + config = import_string(logging_class) + configured_path = config['handlers']['task']['class'] + for new_path, old_path in LOGS: + if configured_path == old_path: + return [ + "This path : `{old}` should be updated to this path: `{new}`".format(old=old_path, + new=new_path) + ] diff --git a/airflow/upgrade/rules/use_customsqlainterface_class_rule.py b/airflow/upgrade/rules/use_customsqlainterface_class_rule.py new file mode 100644 index 0000000000000..e3db9313d1fb1 --- /dev/null +++ b/airflow/upgrade/rules/use_customsqlainterface_class_rule.py @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.upgrade.rules.base_rule import BaseRule +from airflow.www_rbac.utils import CustomSQLAInterface + + +class UseCustomSQLAInterfaceClassRule(BaseRule): + title = "Use CustomSQLAInterface instead of SQLAInterface for custom data models." + + description = """\ +From Airflow 2.0, if you want to define your own Flask App Builder models you need to +use CustomSQLAInterface instead of SQLAInterface. 
+ +For Non-RBAC replace: + +`from flask_appbuilder.models.sqla.interface import SQLAInterface` +`datamodel = SQLAInterface(your_data_model)` + +with RBAC (in 1.10): + +`from airflow.www_rbac.utils import CustomSQLAInterface` +`datamodel = CustomSQLAInterface(your_data_model)` + +and in 2.0: + +`from airflow.www.utils import CustomSQLAInterface` +`datamodel = CustomSQLAInterface(your_data_model)` + """ + + def check(self): + + from airflow.plugins_manager import flask_appbuilder_views + + plugins_with_sqlainterface_data_model_instance = [] + + if flask_appbuilder_views: + for view_obj in flask_appbuilder_views: + if not isinstance(view_obj.get("view").datamodel, CustomSQLAInterface): + plugins_with_sqlainterface_data_model_instance.append(view_obj.get("name")) + + if plugins_with_sqlainterface_data_model_instance: + return ( + "Deprecation Warning: The following views: {} have " + "data models instantiated " + "from the SQLAInterface class.\n".format(plugins_with_sqlainterface_data_model_instance) + + "See: " + "https://github.com/apache/airflow/blob/master/" + "UPDATING.md#use-customsqlainterface-instead-of-sqlainterface-for-custom-data-models" + ) diff --git a/airflow/upgrade/setup.cfg b/airflow/upgrade/setup.cfg new file mode 100644 index 0000000000000..fdfaff4108948 --- /dev/null +++ b/airflow/upgrade/setup.cfg @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
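To make the rule above concrete, here is a sketch of a plugin view that would pass the check on an RBAC 1.10 install: the only difference from a stock Flask-AppBuilder view is that `datamodel` is built from `CustomSQLAInterface`. `Connection` is used purely as a convenient existing model, and the view name and column list are illustrative, not taken from this change.

```python
from flask_appbuilder import ModelView

from airflow.models import Connection
from airflow.www_rbac.utils import CustomSQLAInterface  # airflow.www.utils from 2.0 onwards


class ConnectionBrowserView(ModelView):
    # CustomSQLAInterface (instead of flask_appbuilder's SQLAInterface) is what
    # UseCustomSQLAInterfaceClassRule looks for and keeps the view 2.0-compatible.
    datamodel = CustomSQLAInterface(Connection)
    list_columns = ["conn_id", "conn_type", "host"]
```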
+ +[metadata] +version=attr: airflow.upgrade.version.version +name = apache-airflow-upgrade-check +description = Check for compatibility between Airflow versions +long_description = file: airflow/upgrade/README.md +long_description_content_type = text/markdown +url = https://airflow.apache.org +author = Apache Airflow PMC +author-email = dev@airflow.apache.org +license = Apache License 2.0 +license_files = + LICENSE + NOTICE +classifiers = + Development Status :: 5 - Production/Stable + Intended Audience :: Developers + License :: OSI Approved :: Apache Software License + Programming Language :: Python :: 2.7 + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 +keywords = airflow, upgrade +project_urls = + Source Code=https://github.com/apache/airflow + Bug Tracker=https://github.com/apache/airflow/issues + Documentation=https://airflow.apache.org/docs/ + +[options] +packages = find: +install_requires = + apache-airflow>=1.10.14,<3 + importlib-metadata~=2.0; python_version<"3.8" + packaging +python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.* +setup_requires = + setuptools>=40.0 + wheel +zip_safe = no + +[options.packages.find] +include = + airflow.upgrade + airflow.upgrade.* + +[bdist_wheel] +universal=1 diff --git a/common/_default_branch.sh b/airflow/upgrade/version.py similarity index 93% rename from common/_default_branch.sh rename to airflow/upgrade/version.py index e4c00ec2e118f..f91621565a90f 100644 --- a/common/_default_branch.sh +++ b/airflow/upgrade/version.py @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -16,4 +15,4 @@ # specific language governing permissions and limitations # under the License. 
-export DEFAULT_BRANCH="v1-10-test" +version = "1.3.0" diff --git a/airflow/utils/cli.py b/airflow/utils/cli.py index 297da404d19b2..578f07ca621ee 100644 --- a/airflow/utils/cli.py +++ b/airflow/utils/cli.py @@ -21,17 +21,23 @@ Utilities module for cli """ from __future__ import absolute_import +from __future__ import print_function import functools import getpass import json +import os import socket +import struct import sys from argparse import Namespace from datetime import datetime +from fcntl import ioctl +from termios import TIOCGWINSZ from airflow.models import Log from airflow.utils import cli_action_loggers +from airflow.utils.platform import is_terminal_support_colors def action_logging(f): @@ -94,9 +100,20 @@ def _build_metrics(func_name, namespace): :param namespace: Namespace instance from argparse :return: dict with metrics """ + sensitive_fields = {'-p', '--password', '--conn-password'} + full_command = list(sys.argv) + for idx, command in enumerate(full_command): # pylint: disable=too-many-nested-blocks + if command in sensitive_fields: + # For cases when password is passed as "--password xyz" (with space between key and value) + full_command[idx + 1] = "*" * 8 + else: + # For cases when password is passed as "--password=xyz" (with '=' between key and value) + for sensitive_field in sensitive_fields: + if command.startswith('{}='.format(sensitive_field)): + full_command[idx] = '{}={}'.format(sensitive_field, "*" * 8) metrics = {'sub_command': func_name, 'start_datetime': datetime.utcnow(), - 'full_command': '{}'.format(list(sys.argv)), 'user': getpass.getuser()} + 'full_command': '{}'.format(full_command), 'user': getpass.getuser()} assert isinstance(namespace, Namespace) tmp_dic = vars(namespace) @@ -116,3 +133,102 @@ def _build_metrics(func_name, namespace): execution_date=metrics.get('execution_date')) metrics['log'] = log return metrics + + +class ColorMode: + """ + Coloring modes. If `auto` is then automatically detected. 
+ """ + ON = "on" + OFF = "off" + AUTO = "auto" + + +def should_use_colors(args): + """ + Processes arguments and decides whether to enable color in output + """ + if args.color == ColorMode.ON: + return True + if args.color == ColorMode.OFF: + return False + return is_terminal_support_colors() + + +def get_terminal_size(fallback=(80, 20)): + """Return a tuple of (terminal height, terminal width).""" + try: + return struct.unpack('hhhh', ioctl(sys.__stdout__, TIOCGWINSZ, '\000' * 8))[0:2] + except IOError: + # when the output stream or init descriptor is not a tty, such + # as when when stdout is piped to another program + pass + try: + return int(os.environ.get('LINES')), int(os.environ.get('COLUMNS')) + except TypeError: + return fallback + + +def header(text, fillchar): + rows, columns = get_terminal_size() + print(" {} ".format(text).center(columns, fillchar)) + + +def deprecated_action(func=None, new_name=None, sub_commands=False): + if not func: + return functools.partial(deprecated_action, new_name=new_name, sub_commands=sub_commands) + + stream = sys.stderr + try: + from pip._vendor import colorama + WINDOWS = (sys.platform.startswith("win") or + (sys.platform == 'cli' and os.name == 'nt')) + if WINDOWS: + stream = colorama.AnsiToWin32(sys.stderr) + except Exception: + colorama = None + + def should_color(): + # Don't colorize things if we do not have colorama or if told not to + if not colorama: + return False + + real_stream = ( + stream if not isinstance(stream, colorama.AnsiToWin32) + else stream.wrapped + ) + + # If the stream is a tty we should color it + if hasattr(real_stream, "isatty") and real_stream.isatty(): + return True + + if os.environ.get("TERM") and "color" in os.environ.get("TERM"): + return True + + # If anything else we should not color it + return False + + @functools.wraps(func) + def wrapper(args): + if getattr(args, 'deprecation_warning', True): + command = args.subcommand or args.func.__name__ + if sub_commands: + msg = ( + "The mode (-l, -d, etc) options to {!r} have been deprecated and removed in Airflow 2.0," + " please use the get/set/list subcommands instead" + ).format(command) + else: + prefix = "The {!r} command is deprecated and removed in Airflow 2.0, please use " + if isinstance(new_name, list): + msg = prefix.format(args.subcommand) + new_names = list(map(repr, new_name)) + msg += "{}, or {}".format(", ".join(new_names[:-1]), new_names[-1]) + msg += " instead" + else: + msg = (prefix + "{!r} instead").format(command, new_name) + + if should_color(): + msg = "".join([colorama.Fore.YELLOW, msg, colorama.Style.RESET_ALL]) + print(msg, file=sys.stderr) + func(args) + return wrapper diff --git a/airflow/utils/code_utils.py b/airflow/utils/code_utils.py new file mode 100644 index 0000000000000..3a9fbcba4b493 --- /dev/null +++ b/airflow/utils/code_utils.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
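The `_build_metrics` change above scrubs passwords from the command line before it is logged, handling both the `--password xyz` form (value in the next token) and the `--password=xyz` form (flag and value in one token). A standalone sketch of that masking logic; the function name and the example invocation are mine:

```python
SENSITIVE_FLAGS = {"-p", "--password", "--conn-password"}


def mask_sensitive_args(argv):
    """Return a copy of argv with the values of sensitive flags replaced by '********'."""
    masked = list(argv)
    for idx, arg in enumerate(masked):
        if arg in SENSITIVE_FLAGS:
            # "--password xyz": the value is the next token.
            if idx + 1 < len(masked):
                masked[idx + 1] = "*" * 8
        else:
            # "--password=xyz": flag and value share one token.
            for flag in SENSITIVE_FLAGS:
                if arg.startswith(flag + "="):
                    masked[idx] = "{}={}".format(flag, "*" * 8)
    return masked


print(mask_sensitive_args(["airflow", "connections", "--add", "--conn-password=hunter2"]))
# ['airflow', 'connections', '--add', '--conn-password=********']
```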
See the License for the +# specific language governing permissions and limitations +# under the License. + + +def prepare_code_snippet(file_path, line_no, context_lines_count=5): + """ + Prepare code snippet with line numbers and a specific line marked. + + :param file_path: File nam + :param line_no: Line number + :param context_lines_count: The number of lines that will be cut before and after. + :return: str + """ + with open(file_path) as text_file: + # Highlight code + code = text_file.read() + code_lines = code.split("\n") + # Prepend line number + code_lines = [ + ">{lno:3} | {line}".format(lno=lno, line=line) + if line_no == lno else "{lno:4} | {line}".format(lno=lno, line=line) + for lno, line in enumerate(code_lines, 1) + ] + # # Cut out the snippet + start_line_no = max(0, line_no - context_lines_count - 1) + end_line_no = line_no + context_lines_count + code_lines = code_lines[start_line_no:end_line_no] + # Join lines + code = "\n".join(code_lines) + return code diff --git a/airflow/utils/dag_processing.py b/airflow/utils/dag_processing.py index c888726f4fcb8..58e6af01b785c 100644 --- a/airflow/utils/dag_processing.py +++ b/airflow/utils/dag_processing.py @@ -50,16 +50,19 @@ from airflow.exceptions import AirflowException from airflow.settings import Stats from airflow.models import errors -from airflow.settings import STORE_SERIALIZED_DAGS +from airflow.settings import STORE_DAG_CODE, STORE_SERIALIZED_DAGS from airflow.utils import timezone from airflow.utils.helpers import reap_process_group from airflow.utils.db import provide_session from airflow.utils.log.logging_mixin import LoggingMixin +from airflow.utils.mixins import MultiprocessingStartMethodMixin from airflow.utils.state import State if six.PY2: ConnectionError = IOError +log = logging.getLogger(__name__) + class SimpleDag(BaseDag): """ @@ -76,7 +79,6 @@ def __init__(self, dag, pickle_id=None): self._dag_id = dag.dag_id self._task_ids = [task.task_id for task in dag.tasks] self._full_filepath = dag.full_filepath - self._is_paused = dag.is_paused self._concurrency = dag.concurrency self._pickle_id = pickle_id self._task_special_args = {} @@ -119,14 +121,6 @@ def concurrency(self): """ return self._concurrency - @property - def is_paused(self): - """ - :return: whether this DAG is paused or not - :rtype: bool - """ - return self._is_paused - @property def pickle_id(self): """ @@ -375,7 +369,6 @@ def list_py_file_paths(directory, safe_mode=conf.getboolean('core', 'DAG_DISCOVE file_paths.append(file_path) except Exception: - log = LoggingMixin().log log.exception("Error while examining %s", f) if include_examples: import airflow.example_dags @@ -482,7 +475,7 @@ class DagParsingSignal(enum.Enum): END_MANAGER = 'end_manager' -class DagFileProcessorAgent(LoggingMixin): +class DagFileProcessorAgent(LoggingMixin, MultiprocessingStartMethodMixin): """ Agent for DAG file processing. It is responsible for all DAG parsing related jobs in scheduler process. Mainly it can spin up DagFileProcessorManager @@ -498,6 +491,8 @@ def __init__(self, max_runs, processor_factory, processor_timeout, + dag_ids, + pickle_dags, async_mode): """ :param dag_directory: Directory where DAG definitions are kept. 
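A quick usage example for the new `prepare_code_snippet` helper above: it returns the requested slice of the file with line numbers, prefixing the marked line with `>`. The file path and line number below are arbitrary, chosen only for illustration.

```python
from airflow.utils.code_utils import prepare_code_snippet

# Show two lines of context around line 3 of this script; line 3 is rendered
# with a leading ">" marker, the surrounding lines with plain line numbers.
print(prepare_code_snippet(__file__, line_no=3, context_lines_count=2))
```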
All @@ -522,6 +517,8 @@ def __init__(self, self._max_runs = max_runs self._processor_factory = processor_factory self._processor_timeout = processor_timeout + self._dag_ids = dag_ids + self._pickle_dags = pickle_dags self._async_mode = async_mode # Map from file path to the processor self._processors = {} @@ -538,16 +535,25 @@ def start(self): """ Launch DagFileProcessorManager processor and start DAG parsing loop in manager. """ - self._parent_signal_conn, child_signal_conn = multiprocessing.Pipe() - self._process = multiprocessing.Process( + if six.PY2: + context = multiprocessing + else: + mp_start_method = self._get_multiprocessing_start_method() + context = multiprocessing.get_context(mp_start_method) + + self._parent_signal_conn, child_signal_conn = context.Pipe() + self._process = context.Process( target=type(self)._run_processor_manager, args=( self._dag_directory, self._file_paths, self._max_runs, - self._processor_factory, + # getattr prevents error while pickling an instance method. + getattr(self, "_processor_factory"), self._processor_timeout, child_signal_conn, + self._dag_ids, + self._pickle_dags, self._async_mode, ) ) @@ -593,6 +599,8 @@ def _run_processor_manager(dag_directory, processor_factory, processor_timeout, signal_conn, + dag_ids, + pickle_dags, async_mode): # Make this process start as a new process group - that makes it easy @@ -619,6 +627,8 @@ def _run_processor_manager(dag_directory, processor_factory, processor_timeout, signal_conn, + dag_ids, + pickle_dags, async_mode) processor_manager.start() @@ -731,6 +741,10 @@ class DagFileProcessorManager(LoggingMixin): :type processor_timeout: timedelta :param signal_conn: connection to communicate signal with processor agent. :type signal_conn: airflow.models.connection.Connection + :param dag_ids: if specified, only schedule tasks with these DAG IDs + :type dag_ids: list[str] + :param pickle_dags: whether to pickle DAGs. + :type pickle_dags: bool :param async_mode: whether to start the manager in async mode :type async_mode: bool """ @@ -742,6 +756,8 @@ def __init__(self, processor_factory, processor_timeout, signal_conn, + dag_ids, + pickle_dags, async_mode=True): self._file_paths = file_paths self._file_path_queue = [] @@ -749,12 +765,14 @@ def __init__(self, self._max_runs = max_runs self._processor_factory = processor_factory self._signal_conn = signal_conn + self._pickle_dags = pickle_dags + self._dag_ids = dag_ids self._async_mode = async_mode - self._parallelism = conf.getint('scheduler', 'max_threads') + self._parallelism = conf.getint('scheduler', 'parsing_processes') if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1: self.log.warning( - "Because we cannot use more than 1 thread (max_threads = {}) " + "Because we cannot use more than 1 thread (parsing_processes = {}) " "when using sqlite. 
So we set parallelism to 1.".format(self._parallelism) ) self._parallelism = 1 @@ -772,7 +790,7 @@ def __init__(self, # Map from file path to the processor self._processors = {} - self._heartbeat_count = 0 + self._num_run = 0 # Map from file path to stats about the file self._file_stats = {} # type: dict(str, DagFileStat) @@ -852,11 +870,24 @@ def start(self): # are told to (as that would open another connection to the # SQLite DB which isn't a good practice continue - + # pylint: enable=no-else-break self._refresh_dag_dir() - self._find_zombies() + self._find_zombies() # pylint: disable=no-value-for-parameter + + self._kill_timed_out_processors() + simple_dags = self.collect_results() + + # Generate more file paths to process if we processed all the files + # already. + if not self._file_path_queue: + self.emit_metrics() + self.prepare_file_path_queue() + + self.start_new_processes() + + # Update number of loop iteration. + self._num_run += 1 - simple_dags = self.heartbeat() for simple_dag in simple_dags: self._signal_conn.send(simple_dag) @@ -923,7 +954,7 @@ def _refresh_dag_dir(self): SerializedDagModel.remove_deleted_dags(self._file_paths) DagModel.deactivate_deleted_dags(self._file_paths) - if conf.getboolean('core', 'store_dag_code', fallback=False): + if STORE_DAG_CODE: from airflow.models.dagcode import DagCode DagCode.remove_deleted_code(self._file_paths) @@ -991,7 +1022,7 @@ def _log_file_processing_stats(self, known_file_paths): processor_pid = self.get_pid(file_path) processor_start_time = self.get_start_time(file_path) - runtime = ((now - processor_start_time).total_seconds() if processor_start_time else None) + runtime = ((now - processor_start_time) if processor_start_time else None) last_run = self.get_last_finish_time(file_path) if last_run: seconds_ago = (now - last_run).total_seconds() @@ -1016,7 +1047,7 @@ def _log_file_processing_stats(self, known_file_paths): for file_path, pid, runtime, num_dags, num_errors, last_runtime, last_run in rows: formatted_rows.append((file_path, pid, - "{:.2f}s".format(runtime) if runtime else None, + "{:.2f}s".format(runtime.total_seconds()) if runtime else None, num_dags, num_errors, "{:.2f}s".format(last_runtime) if last_runtime else None, @@ -1206,67 +1237,14 @@ def collect_results(self): return simple_dags - def heartbeat(self): - """ - This should be periodically called by the manager loop. This method will - kick off new processes to process DAG definition files and read the - results from the finished processors. - - :return: a list of SimpleDags that were produced by processors that - have finished since the last time this was called - :rtype: list[airflow.utils.dag_processing.SimpleDag] + def start_new_processes(self): + """" + Start more processors if we have enough slots and files to process """ - simple_dags = self.collect_results() - - # Generate more file paths to process if we processed all the files - # already. 
- if len(self._file_path_queue) == 0: - self.emit_metrics() - - self._parsing_start_time = timezone.utcnow() - # If the file path is already being processed, or if a file was - # processed recently, wait until the next batch - file_paths_in_progress = self._processors.keys() - now = timezone.utcnow() - file_paths_recently_processed = [] - for file_path in self._file_paths: - last_finish_time = self.get_last_finish_time(file_path) - if (last_finish_time is not None and - (now - last_finish_time).total_seconds() < - self._file_process_interval): - file_paths_recently_processed.append(file_path) - - files_paths_at_run_limit = [file_path - for file_path, stat in self._file_stats.items() - if stat.run_count == self._max_runs] - - files_paths_to_queue = list(set(self._file_paths) - - set(file_paths_in_progress) - - set(file_paths_recently_processed) - - set(files_paths_at_run_limit)) - - for file_path, processor in self._processors.items(): - self.log.debug( - "File path %s is still being processed (started: %s)", - processor.file_path, processor.start_time.isoformat() - ) - - self.log.debug( - "Queuing the following files for processing:\n\t%s", - "\n\t".join(files_paths_to_queue) - ) - - for file_path in files_paths_to_queue: - if file_path not in self._file_stats: - self._file_stats[file_path] = DagFileStat(0, 0, None, None, 0) - - self._file_path_queue.extend(files_paths_to_queue) - - # Start more processors if we have enough slots and files to process - while (self._parallelism - len(self._processors) > 0 and - len(self._file_path_queue) > 0): + while self._parallelism - len(self._processors) > 0 and self._file_path_queue: file_path = self._file_path_queue.pop(0) - processor = self._processor_factory(file_path, self._zombies) + processor = self._processor_factory(file_path, self._zombies, + self._dag_ids, self._pickle_dags) Stats.incr('dag_processing.processes') processor.start() @@ -1276,10 +1254,48 @@ def heartbeat(self): ) self._processors[file_path] = processor - # Update heartbeat count. - self._heartbeat_count += 1 + def prepare_file_path_queue(self): + """ + Generate more file paths to process. Result are saved in _file_path_queue. 
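The restructured manager loop in this hunk drops `heartbeat()` and instead calls `collect_results()`, refills the queue only when it has been drained (`prepare_file_path_queue()`), starts new processors up to the parallelism limit (`start_new_processes()`), and counts iterations in `_num_run`. A toy caricature of that call order — not the real `DagFileProcessorManager`, just a runnable sketch with stand-in classes of my own naming:

```python
class ToyProcessor(object):
    """Stand-in for a DAG-file processor: 'parses' one path and reports a result."""

    def __init__(self, path):
        self.path = path

    def result(self):
        return "parsed " + self.path


class ToyManager(object):
    """Collect results, refill the queue when empty, start processors, count runs."""

    def __init__(self, file_paths, parallelism=2):
        self._file_paths = list(file_paths)
        self._file_path_queue = []
        self._processors = {}
        self._parallelism = parallelism
        self._num_run = 0

    def collect_results(self):
        results = [p.result() for p in self._processors.values()]
        self._processors = {}
        return results

    def prepare_file_path_queue(self):
        self._file_path_queue.extend(self._file_paths)

    def start_new_processes(self):
        while self._parallelism - len(self._processors) > 0 and self._file_path_queue:
            path = self._file_path_queue.pop(0)
            self._processors[path] = ToyProcessor(path)

    def loop_once(self):
        results = self.collect_results()
        if not self._file_path_queue:
            self.prepare_file_path_queue()
        self.start_new_processes()
        self._num_run += 1
        return results


manager = ToyManager(["a.py", "b.py", "c.py"])
for _ in range(3):
    print(manager.loop_once())  # [], then ['parsed a.py', 'parsed b.py'], then ['parsed c.py']
```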
+ """ + self._parsing_start_time = timezone.utcnow() + # If the file path is already being processed, or if a file was + # processed recently, wait until the next batch + file_paths_in_progress = self._processors.keys() + now = timezone.utcnow() + file_paths_recently_processed = [] + for file_path in self._file_paths: + last_finish_time = self.get_last_finish_time(file_path) + if (last_finish_time is not None and + (now - last_finish_time).total_seconds() < + self._file_process_interval): + file_paths_recently_processed.append(file_path) + + files_paths_at_run_limit = [file_path + for file_path, stat in self._file_stats.items() + if stat.run_count == self._max_runs] + + files_paths_to_queue = list(set(self._file_paths) - + set(file_paths_in_progress) - + set(file_paths_recently_processed) - + set(files_paths_at_run_limit)) - return simple_dags + for file_path, processor in self._processors.items(): + self.log.debug( + "File path %s is still being processed (started: %s)", + processor.file_path, processor.start_time.isoformat() + ) + + self.log.debug( + "Queuing the following files for processing:\n\t%s", + "\n\t".join(files_paths_to_queue) + ) + + for file_path in files_paths_to_queue: + if file_path not in self._file_stats: + self._file_stats[file_path] = DagFileStat(0, 0, None, None, 0) + + self._file_path_queue.extend(files_paths_to_queue) @provide_session def _find_zombies(self, session): @@ -1292,7 +1308,7 @@ def _find_zombies(self, session): if not self._last_zombie_query_time or \ (now - self._last_zombie_query_time).total_seconds() > self._zombie_query_interval: # to avoid circular imports - from airflow.jobs import LocalTaskJob as LJ + from airflow.jobs.local_task_job import LocalTaskJob as LJ self.log.info("Finding 'running' jobs without a recent heartbeat") TI = airflow.models.TaskInstance limit_dttm = timezone.utcnow() - timedelta( @@ -1347,7 +1363,7 @@ def max_runs_reached(self): for stat in self._file_stats.values(): if stat.run_count < self._max_runs: return False - if self._heartbeat_count < self._max_runs: + if self._num_run < self._max_runs: return False return True diff --git a/airflow/utils/dates.py b/airflow/utils/dates.py index b3c4a77a22f6d..680336637cdaf 100644 --- a/airflow/utils/dates.py +++ b/airflow/utils/dates.py @@ -34,6 +34,7 @@ '@daily': '0 0 * * *', '@weekly': '0 0 * * 0', '@monthly': '0 0 1 * *', + '@quarterly': '0 0 1 */3 *', '@yearly': '0 0 1 1 *', } diff --git a/airflow/utils/db.py b/airflow/utils/db.py index d689d34200162..89c92aaee8bd3 100644 --- a/airflow/utils/db.py +++ b/airflow/utils/db.py @@ -24,6 +24,7 @@ from functools import wraps +import logging import os import contextlib import json @@ -32,9 +33,8 @@ from airflow import settings from airflow.configuration import conf -from airflow.utils.log.logging_mixin import LoggingMixin -log = LoggingMixin().log +log = logging.getLogger(__name__) ENV_TIMESCALE_ENABLE = strtobool(os.environ.get('ENV_TIMESCALE_ENABLE', 'false')) diff --git a/airflow/utils/email.py b/airflow/utils/email.py index ee764f7e1ad0c..42f0056b6409f 100644 --- a/airflow/utils/email.py +++ b/airflow/utils/email.py @@ -29,6 +29,7 @@ from collections import Iterable as CollectionIterable import importlib +import logging import os import smtplib @@ -40,7 +41,8 @@ from airflow.configuration import conf from airflow.exceptions import AirflowConfigException -from airflow.utils.log.logging_mixin import LoggingMixin + +log = logging.getLogger(__name__) def send_email(to, subject, html_content, @@ -107,7 +109,6 @@ def send_email_smtp(to, 
subject, html_content, files=None, def send_MIME_email(e_from, e_to, mime_msg, dryrun=False): - log = LoggingMixin().log SMTP_HOST = conf.get('smtp', 'SMTP_HOST') SMTP_PORT = conf.getint('smtp', 'SMTP_PORT') diff --git a/airflow/utils/helpers.py b/airflow/utils/helpers.py index 05c6e4dd5e363..4913c4da96ec3 100644 --- a/airflow/utils/helpers.py +++ b/airflow/utils/helpers.py @@ -299,7 +299,7 @@ def signal_procs(sig): # use sudo -n(--non-interactive) to kill the process if err.errno == errno.EPERM: subprocess.check_call( - ["sudo", "-n", "kill", "-" + str(sig)] + [str(p.pid) for p in children] + ["sudo", "-n", "kill", "-" + str(int(sig))] + [str(p.pid) for p in children] ) else: raise diff --git a/airflow/utils/log/colored_log.py b/airflow/utils/log/colored_log.py index a89e779f5fe9b..8f92d80cbf0e9 100644 --- a/airflow/utils/log/colored_log.py +++ b/airflow/utils/log/colored_log.py @@ -23,9 +23,7 @@ import sys from colorlog import TTYColoredFormatter -from termcolor import colored - -ARGS = {"attrs": ["bold"]} +from colorlog.escape_codes import esc, escape_codes DEFAULT_COLORS = { "DEBUG": "red", @@ -35,6 +33,9 @@ "CRITICAL": "red", } +BOLD_ON = escape_codes['bold'] +BOLD_OFF = esc('22') + class CustomTTYColoredFormatter(TTYColoredFormatter): """ @@ -52,7 +53,7 @@ def _color_arg(arg): if isinstance(arg, (int, float)): # In case of %d or %f formatting return arg - return colored(str(arg), **ARGS) # type: ignore + return BOLD_ON + str(arg) + BOLD_OFF @staticmethod def _count_number_of_arguments_in_message(record): @@ -83,7 +84,9 @@ def _color_record_traceback(self, record): record.exc_text = self.formatException(record.exc_info) if record.exc_text: - record.exc_text = colored(record.exc_text, DEFAULT_COLORS["ERROR"]) + record.exc_text = self.color(self.log_colors, record.levelname) + \ + record.exc_text + escape_codes['reset'] + return record def format(self, record): diff --git a/airflow/utils/log/es_task_handler.py b/airflow/utils/log/es_task_handler.py index b986c0f0a5d7a..de85f7c2bb464 100644 --- a/airflow/utils/log/es_task_handler.py +++ b/airflow/utils/log/es_task_handler.py @@ -181,7 +181,7 @@ def es_read(self, log_id, offset, metadata): try: metadata['max_offset'] = s[max_log_line - 1].execute()[-1].offset if max_log_line > 0 else 0 except Exception: - self.log.exception('Could not get current log size with log_id: {}'.format(log_id)) + self.log.exception('Could not get current log size with log_id: %s', log_id) logs = [] if max_log_line != 0: @@ -203,7 +203,7 @@ def set_context(self, ti): self.mark_end_on_close = not ti.raw if self.json_format: - self.formatter = JSONFormatter(self.formatter._fmt, json_fields=self.json_fields, extras={ + self.formatter = JSONFormatter(json_fields=self.json_fields, extras={ 'dag_id': str(ti.dag_id), 'task_id': str(ti.task_id), 'execution_date': self._clean_execution_date(ti.execution_date), diff --git a/airflow/utils/log/file_task_handler.py b/airflow/utils/log/file_task_handler.py index 4496a5bfdd0fe..ef359110125c4 100644 --- a/airflow/utils/log/file_task_handler.py +++ b/airflow/utils/log/file_task_handler.py @@ -108,6 +108,42 @@ def _read(self, ti, try_number, metadata=None): except Exception as e: log = "*** Failed to load local log file: {}\n".format(location) log += "*** {}\n".format(str(e)) + elif conf.get('core', 'executor') == 'KubernetesExecutor': # pylint: disable=too-many-nested-blocks + try: + from airflow.kubernetes.kube_client import get_kube_client + + kube_client = get_kube_client() + + if len(ti.hostname) >= 63: + # Kubernetes takes 
the pod name and truncates it for the hostname. This trucated hostname + # is returned for the fqdn to comply with the 63 character limit imposed by DNS standards + # on any label of a FQDN. + pod_list = kube_client.list_namespaced_pod(conf.get('kubernetes', 'namespace')) + matches = [pod.metadata.name for pod in pod_list.items + if pod.metadata.name.startswith(ti.hostname)] + if len(matches) == 1: + if len(matches[0]) > len(ti.hostname): + ti.hostname = matches[0] + + log += '*** Trying to get logs (last 100 lines) from worker pod {} ***\n\n'\ + .format(ti.hostname) + + res = kube_client.read_namespaced_pod_log( + name=ti.hostname, + namespace=conf.get('kubernetes', 'namespace'), + container='base', + follow=False, + tail_lines=100, + _preload_content=False + ) + + for line in res: + log += line.decode() + + except Exception as f: # pylint: disable=broad-except + log += '*** Unable to fetch logs from worker pod {} ***\n{}\n\n'.format( + ti.hostname, str(f) + ) else: url = os.path.join( "http://{ti.hostname}:{worker_log_server_port}/log", log_relative_path @@ -208,6 +244,9 @@ def _init_file(self, ti): if not os.path.exists(full_path): open(full_path, "a").close() # TODO: Investigate using 444 instead of 666. - os.chmod(full_path, 0o666) + try: + os.chmod(full_path, 0o666) + except OSError: + logging.warning("OSError while change ownership of the log file") return full_path diff --git a/airflow/utils/log/json_formatter.py b/airflow/utils/log/json_formatter.py index 1d90bc396d9c7..3cf45302accc2 100644 --- a/airflow/utils/log/json_formatter.py +++ b/airflow/utils/log/json_formatter.py @@ -53,6 +53,9 @@ def __init__(self, fmt=None, datefmt=None, json_fields=None, extras=None): self.json_fields = json_fields self.extras = extras + def usesTime(self): + return self.json_fields.count('asctime') > 0 + def format(self, record): super(JSONFormatter, self).format(record) record_dict = {label: getattr(record, label, None) diff --git a/airflow/utils/log/logging_mixin.py b/airflow/utils/log/logging_mixin.py index 3e5991e0e348a..95d137d3bc588 100644 --- a/airflow/utils/log/logging_mixin.py +++ b/airflow/utils/log/logging_mixin.py @@ -70,7 +70,7 @@ def log(self): try: return self._log except AttributeError: - self._log = logging.root.getChild( + self._log = logging.getLogger( self.__class__.__module__ + '.' + self.__class__.__name__ ) return self._log @@ -95,6 +95,14 @@ def __init__(self, logger, level): self.level = level self._buffer = str() + def close(self): + """ + Provide close method, for compatibility with the io.IOBase interface. + + This is a no-op method. + """ + pass + @property def closed(self): """ diff --git a/airflow/utils/mixins.py b/airflow/utils/mixins.py new file mode 100644 index 0000000000000..9bbc1fea9bfd7 --- /dev/null +++ b/airflow/utils/mixins.py @@ -0,0 +1,36 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
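The `usesTime` override above matters because the base `logging.Formatter` only computes `record.asctime` when `usesTime()` returns True; with JSON output there is no `%(asctime)s` format string to trigger that, so the formatter opts in whenever `asctime` appears in `json_fields`. A small self-contained formatter in the same spirit (the class name and field list are mine, not Airflow's):

```python
import json
import logging


class SimpleJSONFormatter(logging.Formatter):
    """Minimal formatter illustrating the usesTime() hook."""

    def __init__(self, json_fields):
        super(SimpleJSONFormatter, self).__init__()
        self.json_fields = json_fields

    def usesTime(self):
        # record.asctime is only populated by logging.Formatter.format()
        # when this returns True.
        return "asctime" in self.json_fields

    def format(self, record):
        super(SimpleJSONFormatter, self).format(record)
        return json.dumps({field: getattr(record, field, None) for field in self.json_fields})


handler = logging.StreamHandler()
handler.setFormatter(SimpleJSONFormatter(["asctime", "levelname", "message"]))
logger = logging.getLogger("usestime_demo")
logger.addHandler(handler)
logger.warning("hello")  # {"asctime": "...", "levelname": "WARNING", "message": "hello"}
```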
See the License for the +# specific language governing permissions and limitations +# under the License. + +import multiprocessing + +from airflow.configuration import conf + + +class MultiprocessingStartMethodMixin: + """ + Convenience class to add support for different types of multiprocessing. + """ + def _get_multiprocessing_start_method(self): + """ + Determine method of creating new processes by checking if the + mp_start_method is set in configs, else, it uses the OS default. + """ + if conf.has_option('core', 'mp_start_method'): + return conf.get('core', 'mp_start_method') + + return multiprocessing.get_start_method() diff --git a/airflow/utils/net.py b/airflow/utils/net.py index 206102591344a..14a446765ce16 100644 --- a/airflow/utils/net.py +++ b/airflow/utils/net.py @@ -43,7 +43,10 @@ def get_hostname(): return socket.getfqdn() # Since we have a callable path, we try to import and run it next. - module_path, attr_name = callable_path.split(':') - module = importlib.import_module(module_path) - callable = getattr(module, attr_name) - return callable() + if ":" in callable_path: + module_path, attr_name = callable_path.split(':') + module = importlib.import_module(module_path) + callable = getattr(module, attr_name) + return callable() + else: + return conf.getimport('core', 'hostname_callable', fallback='socket.getfqdn')() diff --git a/airflow/utils/platform.py b/airflow/utils/platform.py new file mode 100644 index 0000000000000..3630783269007 --- /dev/null +++ b/airflow/utils/platform.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Platform and system specific function. +""" +import os +import sys + +import six + + +def is_tty(): + """ + Checks if the standard output is s connected (is associated with a terminal device) to a tty(-like) device + """ + if six.PY2: + _, w = os.pipe() + return os.isatty(w) + if not hasattr(sys.stdout, "isatty"): + return False + return sys.stdout.isatty() + + +def is_terminal_support_colors(): + """" + Try to determine if the current terminal supports colors. 
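The mixin above lets the DAG-file processing code pick its `multiprocessing` start method from `[core] mp_start_method`, falling back to the interpreter's platform default when the option is unset. A standalone Python 3 sketch of the same lookup-and-use pattern; the `configured` argument stands in for the config read, and `square` is just a demo workload:

```python
import multiprocessing


def resolve_start_method(configured=None):
    """Return the configured start method, or the platform default when unset."""
    return configured or multiprocessing.get_start_method()


def square(n):
    return n * n


if __name__ == "__main__":
    # Pass configured="spawn" to mimic setting [core] mp_start_method = spawn.
    context = multiprocessing.get_context(resolve_start_method())
    pool = context.Pool(processes=2)
    try:
        print(pool.map(square, range(5)))  # [0, 1, 4, 9, 16]
    finally:
        pool.close()
        pool.join()
```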
+ """ + if sys.platform == "win32": + return False + if not is_tty(): + return False + if "COLORTERM" in os.environ: + return True + term = os.environ.get("TERM", "dumb").lower() + if term in ("xterm", "linux") or "color" in term: + return True + return False diff --git a/airflow/utils/sqlalchemy.py b/airflow/utils/sqlalchemy.py index 1939440b3879c..234ce4336e273 100644 --- a/airflow/utils/sqlalchemy.py +++ b/airflow/utils/sqlalchemy.py @@ -23,6 +23,7 @@ from __future__ import unicode_literals import datetime +import logging import os import json import pendulum @@ -32,9 +33,8 @@ from sqlalchemy.types import Text, DateTime, TypeDecorator from airflow.configuration import conf -from airflow.utils.log.logging_mixin import LoggingMixin -log = LoggingMixin().log +log = logging.getLogger(__name__) utc = pendulum.timezone('UTC') using_mysql = conf.get('core', 'sql_alchemy_conn').lower().startswith('mysql') diff --git a/airflow/utils/state.py b/airflow/utils/state.py index 320b996d5d500..bb3ad3960e5f5 100644 --- a/airflow/utils/state.py +++ b/airflow/utils/state.py @@ -21,6 +21,8 @@ from builtins import object +from airflow.settings import STATE_COLORS + class State(object): """ @@ -80,6 +82,7 @@ class State(object): SCHEDULED: 'tan', NONE: 'lightblue', } + state_color.update(STATE_COLORS) # type: ignore @classmethod def color(cls, state): diff --git a/airflow/www/api/experimental/endpoints.py b/airflow/www/api/experimental/endpoints.py index 6a23089aa9328..91d060d7dbe9c 100644 --- a/airflow/www/api/experimental/endpoints.py +++ b/airflow/www/api/experimental/endpoints.py @@ -20,6 +20,8 @@ g, Blueprint, jsonify, request, url_for ) +import logging + import airflow.api from airflow.api.common.experimental import delete_dag as delete from airflow.api.common.experimental import pool as pool_api @@ -31,12 +33,11 @@ from airflow.api.common.experimental.get_dag_run_state import get_dag_run_state from airflow.exceptions import AirflowException from airflow.utils import timezone -from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.strings import to_boolean from airflow.www.app import csrf from airflow import models -_log = LoggingMixin().log +log = logging.getLogger(__name__) requires_authentication = airflow.api.API_AUTH.api_auth.requires_authentication @@ -73,7 +74,7 @@ def trigger_dag(dag_id): 'Given execution date, {}, could not be identified ' 'as a date. 
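With `state_color.update(STATE_COLORS)` above, a deployment can recolour task and DAG-run states from its local settings module rather than patching `State.state_color`. A sketch of such an override, assuming the usual `airflow_local_settings.py` on the Python path; the specific colours are arbitrary:

```python
# airflow_local_settings.py
# Keys are state names, values are any CSS colour the UI understands.
STATE_COLORS = {
    "queued": "darkgray",
    "running": "#01FF70",
    "success": "#2ECC40",
    "failed": "firebrick",
}
```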
Example date format: 2015-11-16T14:34:15+00:00'.format( execution_date)) - _log.info(error_message) + log.info(error_message) response = jsonify({'error': error_message}) response.status_code = 400 @@ -86,13 +87,13 @@ def trigger_dag(dag_id): try: dr = trigger.trigger_dag(dag_id, run_id, conf, execution_date, replace_microseconds) except AirflowException as err: - _log.error(err) + log.error(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response if getattr(g, 'user', None): - _log.info("User %s created %s", g.user, dr) + log.info("User %s created %s", g.user, dr) response = jsonify( message="Created {}".format(dr), @@ -112,7 +113,7 @@ def delete_dag(dag_id): try: count = delete.delete_dag(dag_id) except AirflowException as err: - _log.error(err) + log.error(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response @@ -134,7 +135,7 @@ def dag_runs(dag_id): state = request.args.get('state') dagruns = get_dag_runs(dag_id, state, run_url_route='airflow.graph') except AirflowException as err: - _log.info(err) + log.info(err) response = jsonify(error="{}".format(err)) response.status_code = 400 return response @@ -155,7 +156,7 @@ def get_dag_code(dag_id): try: return get_code(dag_id) except AirflowException as err: - _log.info(err) + log.info(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response @@ -168,7 +169,7 @@ def task_info(dag_id, task_id): try: info = get_task(dag_id, task_id) except AirflowException as err: - _log.info(err) + log.info(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response @@ -225,7 +226,7 @@ def task_instance_info(dag_id, execution_date, task_id): 'Given execution date, {}, could not be identified ' 'as a date. Example date format: 2015-11-16T14:34:15+00:00'.format( execution_date)) - _log.info(error_message) + log.info(error_message) response = jsonify({'error': error_message}) response.status_code = 400 @@ -234,7 +235,7 @@ def task_instance_info(dag_id, execution_date, task_id): try: info = get_task_instance(dag_id, task_id, execution_date) except AirflowException as err: - _log.info(err) + log.info(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response @@ -266,7 +267,7 @@ def dag_run_status(dag_id, execution_date): 'Given execution date, {}, could not be identified ' 'as a date. 
Example date format: 2015-11-16T14:34:15+00:00'.format( execution_date)) - _log.info(error_message) + log.info(error_message) response = jsonify({'error': error_message}) response.status_code = 400 @@ -275,7 +276,7 @@ def dag_run_status(dag_id, execution_date): try: info = get_dag_run_state(dag_id, execution_date) except AirflowException as err: - _log.info(err) + log.info(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response @@ -310,7 +311,7 @@ def get_pool(name): try: pool = pool_api.get_pool(name=name) except AirflowException as err: - _log.error(err) + log.error(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response @@ -325,7 +326,7 @@ def get_pools(): try: pools = pool_api.get_pools() except AirflowException as err: - _log.error(err) + log.error(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response @@ -342,7 +343,7 @@ def create_pool(): try: pool = pool_api.create_pool(**params) except AirflowException as err: - _log.error(err) + log.error(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response @@ -358,7 +359,7 @@ def delete_pool(name): try: pool = pool_api.delete_pool(name=name) except AirflowException as err: - _log.error(err) + log.error(err) response = jsonify(error="{}".format(err)) response.status_code = err.status_code return response diff --git a/airflow/www/app.py b/airflow/www/app.py index 762190b3b6cb3..8c4983d47fe20 100644 --- a/airflow/www/app.py +++ b/airflow/www/app.py @@ -17,8 +17,13 @@ # specific language governing permissions and limitations # under the License. # +import datetime +import logging +import os from typing import Any +import flask +import flask_login import six from flask import Flask from flask_babel import Babel @@ -30,10 +35,10 @@ from werkzeug.middleware.dispatcher import DispatcherMiddleware import airflow -from airflow import models, version, LoggingMixin +from airflow import models, version from airflow.configuration import conf from airflow.models.connection import Connection -from airflow.settings import Session +from airflow.settings import Session, STATE_COLORS from airflow.www.blueprints import routes from airflow.logging_config import configure_logging @@ -42,9 +47,11 @@ from airflow.utils.net import get_hostname csrf = CSRFProtect() +log = logging.getLogger(__name__) def create_app(config=None, testing=False): + app = Flask(__name__) if conf.getboolean('webserver', 'ENABLE_PROXY_FIX'): app.wsgi_app = ProxyFix( @@ -55,10 +62,12 @@ def create_app(config=None, testing=False): x_port=conf.getint("webserver", "PROXY_FIX_X_PORT", fallback=1), x_prefix=conf.getint("webserver", "PROXY_FIX_X_PREFIX", fallback=1) ) - app.secret_key = conf.get('webserver', 'SECRET_KEY') + app.config['PERMANENT_SESSION_LIFETIME'] = datetime.timedelta(minutes=settings.get_session_lifetime_config()) app.config['LOGIN_DISABLED'] = not conf.getboolean( 'webserver', 'AUTHENTICATE') + app.secret_key = conf.get('webserver', 'SECRET_KEY') + app.config['SESSION_COOKIE_HTTPONLY'] = True app.config['SESSION_COOKIE_SECURE'] = conf.getboolean('webserver', 'COOKIE_SECURE') app.config['SESSION_COOKIE_SAMESITE'] = conf.get('webserver', 'COOKIE_SAMESITE') @@ -74,6 +83,9 @@ def create_app(config=None, testing=False): if config: app.config.from_mapping(config) + if 'SQLALCHEMY_ENGINE_OPTIONS' not in app.config: + app.config['SQLALCHEMY_ENGINE_OPTIONS'] = settings.prepare_engine_args() + 
csrf.init_app(app) app.config['TESTING'] = testing @@ -135,9 +147,9 @@ def create_app(config=None, testing=False): models.XCom, Session, name="XComs", category="Admin")) if "dev" in version.version: - airflow_doc_site = "https://airflow.readthedocs.io/en/latest" + airflow_doc_site = "https://s.apache.org/airflow-docs" else: - airflow_doc_site = 'https://airflow.apache.org/docs/{}'.format(version.version) + airflow_doc_site = 'https://airflow.apache.org/docs/apache-airflow/{}'.format(version.version) admin.add_link(base.MenuLink( name="Website", @@ -161,7 +173,6 @@ def create_app(config=None, testing=False): def integrate_plugins(): """Integrate plugins to the context""" - log = LoggingMixin().log from airflow.plugins_manager import ( admin_views, flask_blueprints, menu_links) for v in admin_views: @@ -199,7 +210,8 @@ def jinja_globals(): 'log_auto_tailing_offset': conf.getint( 'webserver', 'log_auto_tailing_offset', fallback=30), 'log_animation_speed': conf.getint( - 'webserver', 'log_animation_speed', fallback=1000) + 'webserver', 'log_animation_speed', fallback=1000), + 'state_color_mapping': STATE_COLORS } @app.before_request diff --git a/airflow/www/static/connection_form.js b/airflow/www/static/connection_form.js index 8517ad2582bf4..2ffab8294873e 100644 --- a/airflow/www/static/connection_form.js +++ b/airflow/www/static/connection_form.js @@ -70,6 +70,14 @@ 'login': 'Username', } }, + yandexcloud: { + hidden_fields: ['host', 'schema', 'login', 'password', 'port', 'extra'], + relabeling: {}, + }, + spark: { + hidden_fields: ['schema', 'login', 'password'], + relabeling: {}, + }, } function connTypeChange(connectionType) { $("div.form-group").removeClass("hide"); diff --git a/airflow/www/static/main.css b/airflow/www/static/main.css index 2740a0d42aaf3..c5008b8efbdb5 100644 --- a/airflow/www/static/main.css +++ b/airflow/www/static/main.css @@ -224,6 +224,10 @@ div.square { box-shadow: inset 0 6px 6px rgba(0, 0, 0, 0.4); } +.dag-doc { + margin-bottom: 15px; +} + .hll { background-color: #ffffcc } .c { color: #408080; font-style: italic } /* Comment */ .err { border: 1px solid #FF0000 } /* Error */ diff --git a/airflow/www/static/underscore.js b/airflow/www/static/underscore.js index 70fae3f6908db..72525399a1dd8 100644 --- a/airflow/www/static/underscore.js +++ b/airflow/www/static/underscore.js @@ -806,7 +806,7 @@ return obj; }; - // Return a copy of the object only containing the whitelisted properties. + // Return a copy of the object only containing the allowed list properties. _.pick = function(obj) { var copy = {}; var keys = concat.apply(ArrayProto, slice.call(arguments, 1)); @@ -816,7 +816,7 @@ return copy; }; - // Return a copy of the object without the blacklisted properties. + // Return a copy of the object without the disallowed properties. _.omit = function(obj) { var copy = {}; var keys = concat.apply(ArrayProto, slice.call(arguments, 1)); diff --git a/airflow/www/templates/admin/master.html b/airflow/www/templates/admin/master.html index 531fac080bf1f..430097283426c 100644 --- a/airflow/www/templates/admin/master.html +++ b/airflow/www/templates/admin/master.html @@ -23,6 +23,13 @@ + {% endblock %} {% block tail_js %} diff --git a/airflow/www/templates/airflow/chart.html b/airflow/www/templates/airflow/chart.html index 1fc37798ba80e..128364c4ea2dc 100644 --- a/airflow/www/templates/airflow/chart.html +++ b/airflow/www/templates/airflow/chart.html @@ -46,7 +46,7 @@ -
{{ chart |safe }}
+
{{ chart }}

{% endblock %} diff --git a/airflow/www/templates/airflow/code.html b/airflow/www/templates/airflow/code.html index 522c8f42207e6..a91975fd13777 100644 --- a/airflow/www/templates/airflow/code.html +++ b/airflow/www/templates/airflow/code.html @@ -40,6 +40,6 @@
{{ subtitle }}
{% endif %} {% if code_html %} - {{ code_html|safe }} + {{ code_html }} {% endif %} {% endblock %} diff --git a/airflow/www/templates/airflow/config.html b/airflow/www/templates/airflow/config.html index 0aedb17f7a454..f7786be784fda 100644 --- a/airflow/www/templates/airflow/config.html +++ b/airflow/www/templates/airflow/config.html @@ -36,7 +36,7 @@
{{ subtitle }}
{% endif %} {% if code_html %} - {{ code_html|safe }} + {{ code_html }} {% endif %}
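The template edits around this point drop the `|safe` filter (`{{ chart }}`, `{{ code_html }}`, and `{{ html_code }}` below), so these values now go through Jinja's autoescaping unless the view explicitly marks them safe. A standalone Jinja2 sketch of the difference, not Airflow's actual view code:

```python
from jinja2 import Environment
from markupsafe import Markup

env = Environment(autoescape=True)
template = env.from_string("<div>{{ code_html }}</div>")

# Plain strings are escaped, so injected markup is neutralised...
print(template.render(code_html="<script>alert('x')</script>"))
# <div>&lt;script&gt;alert(&#39;x&#39;)&lt;/script&gt;</div>

# ...while HTML the view has deliberately built (e.g. highlighted code) can be
# wrapped in Markup() to be rendered as-is.
print(template.render(code_html=Markup("<pre>print('hello')</pre>")))
```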
diff --git a/airflow/www/templates/airflow/dag.html b/airflow/www/templates/airflow/dag.html index 8db06d740d15d..9474cceb5e13c 100644 --- a/airflow/www/templates/airflow/dag.html +++ b/airflow/www/templates/airflow/dag.html @@ -35,7 +35,8 @@

SUBDAG: {{ dag.dag_id }} {% else %} - DAG: {{ dag.dag_id }} {{ dag.description_unicode }} + DAG: {{ dag.dag_id }} + {{ dag.description_unicode[0:150] + '...' if dag.description_unicode and dag.description_unicode|length > 150 else dag.description_unicode|default('', true) }} {% endif %} {% if root %} ROOT: {{ root }} @@ -372,8 +373,8 @@

{{ title }}

- {{ html_code|safe }} + {{ html_code }} {% endblock %} {% block tail %} diff --git a/airflow/www/templates/airflow/dags.html b/airflow/www/templates/airflow/dags.html index 5c3806d56700a..524b67fc3e7cc 100644 --- a/airflow/www/templates/airflow/dags.html +++ b/airflow/www/templates/airflow/dags.html @@ -81,8 +81,9 @@

DAGs

- - {{ dag.dag_id }} + + {{ dag.dag_id }} @@ -216,6 +217,7 @@

DAGs

const DAGS_INDEX = "{{ url_for('admin.index') }}"; const ENTER_KEY_CODE = 13; + const STATE_COLOR = {{ state_color|tojson }}; $('#dag_query').on('keypress', function (e) { // check for key press on ENTER (key code 13) to trigger the search @@ -314,76 +316,81 @@

DAGs

        }
      });
    });
+
+    function drawDagStatsForDag(dag_id, stats) {
+      g = d3.select('svg#dag-run-' + dag_id.replace(/\./g, '__dot__'))
+        .attr('height', diameter + (stroke_width_hover * 2))
+        .attr('width', '110px')
+        .selectAll("g")
+        .data(states)
+        .enter()
+        .append('g')
+        .attr('transform', function(d, i) {
+          x = (i * (diameter + circle_margin)) + (diameter/2 + circle_margin);
+          y = (diameter/2) + stroke_width_hover;
+          return 'translate(' + x + ',' + y + ')';
+        });
+
+      g.append('text')
+        .attr('fill', 'black')
+        .attr('text-anchor', 'middle')
+        .attr('vertical-align', 'middle')
+        .attr('font-size', 8)
+        .attr('y', 3)
+        .text(function(d){ return d.count > 0 ? d.count : ''; });
+
+      g.append('circle')
+        .attr('stroke-width', function(d) {
+          if (d.count > 0)
+            return stroke_width;
+          else {
+            return 1;
+          }
+        })
+        .attr('stroke', function(d) {
+          if (d.count > 0)
+            return STATE_COLOR[d.state];
+          else {
+            return 'gainsboro';
+          }
+        })
+        .attr('fill-opacity', 0)
+        .attr('r', diameter/2)
+        .attr('title', function(d) {return d.state || 'none'})
+        .attr('style', function(d) {
+          if (d.count > 0)
+            return"cursor:pointer;"
+        })
+        .on('click', function(d, i) {
+          if (d.count > 0)
+            window.location = "{{ url_for('dagrun.index_view') }}?flt1_dag_id_equals=" + dag_id + "&flt2_state_equals=" + d.state;
+        })
+        .on('mouseover', function(d, i) {
+          if (d.count > 0) {
+            d3.select(this).transition().duration(400)
+              .attr('fill-opacity', 0.3)
+              .style("stroke-width", stroke_width_hover);
+          }
+        })
+        .on('mouseout', function(d, i) {
+          if (d.count > 0) {
+            d3.select(this).transition().duration(400)
+              .attr('fill-opacity', 0)
+              .style("stroke-width", stroke_width);
+          }
+        })
+        .style("opacity", 0)
+        .transition()
+        .duration(500)
+        .delay(function(d, i){return i*50;})
+        .style("opacity", 1);
+      d3.select("#loading").remove();
+    }
+
     d3.json("{{ url_for('airflow.dag_stats') }}", function(error, json) {
       for(var dag_id in json) {
         states = json[dag_id];
-        g = d3.select('svg#dag-run-' + dag_id.replace(/\./g, '__dot__'))
-          .attr('height', diameter + (stroke_width_hover * 2))
-          .attr('width', '110px')
-          .selectAll("g")
-          .data(states)
-          .enter()
-          .append('g')
-          .attr('transform', function(d, i) {
-            x = (i * (diameter + circle_margin)) + (diameter/2 + circle_margin);
-            y = (diameter/2) + stroke_width_hover;
-            return 'translate(' + x + ',' + y + ')';
-          });
-
-        g.append('text')
-          .attr('fill', 'black')
-          .attr('text-anchor', 'middle')
-          .attr('vertical-align', 'middle')
-          .attr('font-size', 8)
-          .attr('y', 3)
-          .text(function(d){ return d.count > 0 ? d.count : ''; });
-
-        g.append('circle')
-          .attr('stroke-width', function(d) {
-            if (d.count > 0)
-              return stroke_width;
-            else {
-              return 1;
-            }
-          })
-          .attr('stroke', function(d) {
-            if (d.count > 0)
-              return d.color;
-            else {
-              return 'gainsboro';
-            }
-          })
-          .attr('fill-opacity', 0)
-          .attr('r', diameter/2)
-          .attr('title', function(d) {return d.state || 'none'})
-          .attr('style', function(d) {
-            if (d.count > 0)
-              return"cursor:pointer;"
-          })
-          .on('click', function(d, i) {
-            if (d.count > 0)
-              window.location = "{{ url_for('dagrun.index_view') }}?flt1_dag_id_equals=" + d.dag_id + "&flt2_state_equals=" + d.state;
-          })
-          .on('mouseover', function(d, i) {
-            if (d.count > 0) {
-              d3.select(this).transition().duration(400)
-                .attr('fill-opacity', 0.3)
-                .style("stroke-width", stroke_width_hover);
-            }
-          })
-          .on('mouseout', function(d, i) {
-            if (d.count > 0) {
-              d3.select(this).transition().duration(400)
-                .attr('fill-opacity', 0)
-                .style("stroke-width", stroke_width);
-            }
-          })
-          .style("opacity", 0)
-          .transition()
-          .duration(500)
-          .delay(function(d, i){return i*50;})
-          .style("opacity", 1);
-        d3.select("#loading").remove();
+        drawDagStatsForDag(dag_id, states);
       }
       $("#pause_header").tooltip();
       $("#statuses_info").tooltip();
@@ -393,76 +400,81 @@

DAGs

container: "body", }); }); + + function drawTaskStatsForDag(dag_id, states) { + g = d3.select('svg#task-run-' + dag_id.replace(/\./g, '__dot__')) + .attr('height', diameter + (stroke_width_hover * 2)) + .attr('width', '300px') + .selectAll("g") + .data(states) + .enter() + .append('g') + .attr('transform', function(d, i) { + x = (i * (diameter + circle_margin)) + (diameter/2 + circle_margin); + y = (diameter/2) + stroke_width_hover; + return 'translate(' + x + ',' + y + ')'; + }); + + g.append('text') + .attr('fill', 'black') + .attr('text-anchor', 'middle') + .attr('vertical-align', 'middle') + .attr('font-size', 8) + .attr('y', 3) + .text(function(d){ return d.count > 0 ? d.count : ''; }); + + g.append('circle') + .attr('stroke-width', function(d) { + if (d.count > 0) + return stroke_width; + else { + return 1; + } + }) + .attr('stroke', function(d) { + if (d.count > 0) + return STATE_COLOR[d.state]; + else { + return 'gainsboro'; + } + }) + .attr('fill-opacity', 0) + .attr('r', diameter/2) + .attr('title', function(d) {return d.state}) + .attr('style', function(d) { + if (d.count > 0) + return"cursor:pointer;" + }) + .on('click', function(d, i) { + if (d.count > 0) + window.location = "{{ url_for('taskinstance.index_view') }}?flt1_dag_id_equals=" + dag_id + "&flt2_state_equals=" + d.state; + }) + .on('mouseover', function(d, i) { + if (d.count > 0) { + d3.select(this).transition().duration(400) + .attr('fill-opacity', 0.3) + .style("stroke-width", stroke_width_hover); + } + }) + .on('mouseout', function(d, i) { + if (d.count > 0) { + d3.select(this).transition().duration(400) + .attr('fill-opacity', 0) + .style("stroke-width", stroke_width); + } + }) + .style("opacity", 0) + .transition() + .duration(500) + .delay(function(d, i){return i*50;}) + .style("opacity", 1); + d3.select("#loading").remove(); + } + d3.json("{{ url_for('airflow.task_stats') }}?dag_ids=" + (encoded_dag_ids.join(',')), function(error, json) { for(var dag_id in json) { states = json[dag_id]; - g = d3.select('svg#task-run-' + dag_id.replace(/\./g, '__dot__')) - .attr('height', diameter + (stroke_width_hover * 2)) - .attr('width', '300px') - .selectAll("g") - .data(states) - .enter() - .append('g') - .attr('transform', function(d, i) { - x = (i * (diameter + circle_margin)) + (diameter/2 + circle_margin); - y = (diameter/2) + stroke_width_hover; - return 'translate(' + x + ',' + y + ')'; - }); - - g.append('text') - .attr('fill', 'black') - .attr('text-anchor', 'middle') - .attr('vertical-align', 'middle') - .attr('font-size', 8) - .attr('y', 3) - .text(function(d){ return d.count > 0 ? 
d.count : ''; }); - - g.append('circle') - .attr('stroke-width', function(d) { - if (d.count > 0) - return stroke_width; - else { - return 1; - } - }) - .attr('stroke', function(d) { - if (d.count > 0) - return d.color; - else { - return 'gainsboro'; - } - }) - .attr('fill-opacity', 0) - .attr('r', diameter/2) - .attr('title', function(d) {return d.state}) - .attr('style', function(d) { - if (d.count > 0) - return"cursor:pointer;" - }) - .on('click', function(d, i) { - if (d.count > 0) - window.location = "{{ url_for('taskinstance.index_view') }}?flt1_dag_id_equals=" + d.dag_id + "&flt2_state_equals=" + d.state; - }) - .on('mouseover', function(d, i) { - if (d.count > 0) { - d3.select(this).transition().duration(400) - .attr('fill-opacity', 0.3) - .style("stroke-width", stroke_width_hover); - } - }) - .on('mouseout', function(d, i) { - if (d.count > 0) { - d3.select(this).transition().duration(400) - .attr('fill-opacity', 0) - .style("stroke-width", stroke_width); - } - }) - .style("opacity", 0) - .transition() - .duration(500) - .delay(function(d, i){return i*50;}) - .style("opacity", 1); - d3.select("#loading").remove(); + drawTaskStatsForDag(dag_id, states); } $("#pause_header").tooltip(); $("#statuses_info").tooltip(); diff --git a/airflow/www/templates/airflow/duration_chart.html b/airflow/www/templates/airflow/duration_chart.html index 392dc17b40f13..ff5d8ab60b8bf 100644 --- a/airflow/www/templates/airflow/duration_chart.html +++ b/airflow/www/templates/airflow/duration_chart.html @@ -50,8 +50,8 @@ -
{{ chart |safe }}
-
{{ cum_chart | safe}}
+
{{ chart }}
+
{{ cum_chart }}

{% endblock %} diff --git a/airflow/www/templates/airflow/gantt.html b/airflow/www/templates/airflow/gantt.html index a76b38517985a..1889853ab53e1 100644 --- a/airflow/www/templates/airflow/gantt.html +++ b/airflow/www/templates/airflow/gantt.html @@ -24,6 +24,13 @@ + {% endblock %} {% block body %} @@ -33,7 +40,7 @@ Base date: {{ form.base_date(class_="form-control") }} Number of runs: {{ form.num_runs(class_="form-control") }} Run: - {{ form.execution_date(class_="form-control") | safe }} + {{ form.execution_date(class_="form-control") }} @@ -57,7 +64,7 @@ var dag_id = '{{ dag.dag_id }}'; var task_id = ''; var execution_date = ''; - data = {{ data |tojson|safe }}; + data = {{ data |tojson }}; var gantt = d3.gantt() .taskTypes(data.taskNames) .taskStatus(data.taskStatus) diff --git a/airflow/www/templates/airflow/graph.html b/airflow/www/templates/airflow/graph.html index 0535bf85e5477..880c4634a71e5 100644 --- a/airflow/www/templates/airflow/graph.html +++ b/airflow/www/templates/airflow/graph.html @@ -27,12 +27,19 @@ {{ super() }} + {% endblock %} {% block body %} {{ super() }} {% if doc_md %} -
{{ doc_md|safe }}
+{{ doc_md }} {% endif %}
@@ -40,9 +47,9 @@
     Base date: {{ form.base_date(class_="form-control") }}
     Number of runs: {{ form.num_runs(class_="form-control") }}
     Run:
-    {{ form.execution_date(class_="form-control") | safe }}
+    {{ form.execution_date(class_="form-control") }}
     Layout:
-    {{ form.arrange(class_="form-control") | safe }}
+    {{ form.arrange(class_="form-control") }}
@@ -64,14 +71,9 @@
     no_status
-    queued
-    up_for_retry
-    up_for_reschedule
-    upstream_failed
-    skipped
-    failed
-    running
-    success
+    {% for state, state_color in state_color_mapping.items() %}
+    {{state}}
+    {% endfor %}
@@ -107,10 +109,10 @@
     var upstream_color = "#2020A0";
     var downstream_color = "#0000FF";
-    var nodes = {{ nodes|tojson|safe }};
-    var edges = {{ edges|tojson|safe }};
-    var tasks = {{ tasks|tojson|safe }};
-    var task_instances = {{ task_instances|tojson|safe }};
+    var nodes = {{ nodes|tojson }};
+    var edges = {{ edges|tojson }};
+    var tasks = {{ tasks|tojson }};
+    var task_instances = {{ task_instances|tojson }};
     var execution_date = "{{ execution_date }}";
     var arrange = "{{ arrange }}";
     var g = dagreD3.json.decode(nodes, edges);
diff --git a/airflow/www/templates/airflow/list_dags.html b/airflow/www/templates/airflow/list_dags.html
index 3e9fe2934ed50..e9398eb68d79d 100644
--- a/airflow/www/templates/airflow/list_dags.html
+++ b/airflow/www/templates/airflow/list_dags.html
@@ -216,8 +216,8 @@

DAGs

        {% if filter_groups %}
          var filter = new AdminFilters(
            '#filter_form', '.field-filters',
-            {{ filter_groups|tojson|safe }},
-            {{ active_filters|tojson|safe }}
+            {{ filter_groups|tojson }},
+            {{ active_filters|tojson }}
          );
        {% endif %}
      })(jQuery);
diff --git a/airflow/www/templates/airflow/nvd3.html b/airflow/www/templates/airflow/nvd3.html
index a36f9cbc644e2..b306bbcc39463 100644
--- a/airflow/www/templates/airflow/nvd3.html
+++ b/airflow/www/templates/airflow/nvd3.html
@@ -142,7 +142,7 @@

 {% endblock %}
diff --git a/airflow/www_rbac/templates/airflow/duration_chart.html b/airflow/www_rbac/templates/airflow/duration_chart.html
index 4955f11572a62..b1b9fa0bbbc8c 100644
--- a/airflow/www_rbac/templates/airflow/duration_chart.html
+++ b/airflow/www_rbac/templates/airflow/duration_chart.html
@@ -46,8 +46,8 @@
-    {{ chart |safe }}
-    {{ cum_chart | safe}}
+    {{ chart }}
+    {{ cum_chart }}
 
 {% endblock %}
diff --git a/airflow/www_rbac/templates/airflow/gantt.html b/airflow/www_rbac/templates/airflow/gantt.html
index 7dc53ae214827..c5c4c42fed84e 100644
--- a/airflow/www_rbac/templates/airflow/gantt.html
+++ b/airflow/www_rbac/templates/airflow/gantt.html
@@ -21,6 +21,13 @@
 {{ super() }}
+
 {% endblock %}
 {% block content %}
@@ -30,7 +37,7 @@
     Base date: {{ form.base_date(class_="form-control") }}
     Number of runs: {{ form.num_runs(class_="form-control") }}
     Run:
-    {{ form.execution_date(class_="form-control") | safe }}
+    {{ form.execution_date(class_="form-control") }}
@@ -47,15 +54,15 @@
 /
+