Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions images/pyspark-notebook/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,12 @@ USER ${NB_UID}
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on: <https://github.com/apache/spark>
# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
# To find the right version, check the pandas version being installed here:
# https://github.com/apache/spark/blob/<SPARK_VERSION>/dev/infra/Dockerfile
RUN mamba install --yes \
'grpcio-status' \
'grpcio' \
'pandas=2.2.2' \
'pandas=2.2.3' \
'pyarrow' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
Expand Down
8 changes: 3 additions & 5 deletions images/pyspark-notebook/setup_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,8 @@ def get_latest_spark_version() -> str:
LOGGER.info("Downloading Spark versions information")
all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
LOGGER.info(f"All refs: {all_refs}")
versions = [
ref.removeprefix("spark-").removesuffix("/")
for ref in all_refs
if re.match(r"^spark-\d", ref) is not None and "incubating" not in ref
]
pattern = re.compile(r"^spark-(\d+\.\d+\.\d+)/$")
versions = [match.group(1) for ref in all_refs if (match := pattern.match(ref))]
LOGGER.info(f"Available versions: {versions}")

# Compare versions semantically
Expand Down Expand Up @@ -74,6 +71,7 @@ def download_spark(
spark_dir_name += f"-scala{scala_version}"
LOGGER.info(f"Spark directory name: {spark_dir_name}")
spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"
LOGGER.info(f"Spark download URL: {spark_url}")

tmp_file = Path("/tmp/spark.tar.gz")
subprocess.check_call(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# Distributed under the terms of the Modified BSD License.
import pandas

assert pandas.__version__ == "2.2.2"
assert pandas.__version__ == "2.2.3"
2 changes: 1 addition & 1 deletion tests/shared_checks/nbconvert_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def check_nbconvert(
no_warnings: bool = True,
) -> str:
"""Check if nbconvert is able to convert a notebook file"""
cont_data_file = "/home/jovyan/data/" + host_file.name
cont_data_file = "/home/jovyan/" + host_file.name
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For some reason, this test now requires the directory containing the file to be writable.
I don't have time to dig into why this happens, so I just changed the mounted file location.


output_dir = "/tmp"
LOGGER.info(
Expand Down