feat(emr): set PYSPARK_PYTHON to python 3.11 (#13487)
hongbo-miao committed Dec 29, 2023
1 parent 0319324 commit 2941b6e
Showing 8 changed files with 119 additions and 66 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -194,7 +194,7 @@ make kubernetes-clean
- **Node.js** - JavaScript runtime
- **npm** - JavaScript package management

-## Database
+## Database, Data Warehouse, Data Lakehouse

- **Trino** - Distributed SQL query engine
- **PostgreSQL** - Object-relational database
11 changes: 5 additions & 6 deletions aws/aws-secrets-manager/secrets/hm-iot-rds-secret.json.template
@@ -1,8 +1,7 @@
{
"engine": "postgres",
"postgres_host": "hm-iot-rds.xxxxxxxxxxxx.us-west-2.rds.amazonaws.com",
"postgres_port": "5432",
"postgres_db": "hm_iot_db",
"postgres_user": "hm_iot_db_readonly",
"postgres_password": "xxx"
"host": "hm-iot-rds.xxxxxxxxxxxx.us-west-2.rds.amazonaws.com",
"port": "5432",
"db": "hm_iot_db",
"user": "hm_iot_db_readonly",
"password": "xxx"
}
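Note: terraform/main.tf (further below) reads these renamed keys via jsondecode. As an illustration only (not part of this commit), an application could fetch the same secret with boto3; the secret name "hm-iot-rds-secret" is an assumed placeholder:

```python
# Illustration only; the secret name below is an assumed placeholder.
import json

import boto3

secrets_manager_client = boto3.client("secretsmanager", region_name="us-west-2")
secret = json.loads(
    secrets_manager_client.get_secret_value(SecretId="hm-iot-rds-secret")["SecretString"]
)
# The renamed keys: host, port, db, user, password
jdbc_url = f"jdbc:postgresql://{secret['host']}:{secret['port']}/{secret['db']}"
print(jdbc_url, secret["user"])
```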
4 changes: 2 additions & 2 deletions terraform/Makefile
@@ -5,9 +5,9 @@ terraform-update-lock-file:
terraform-reconfigure:
terraform init -reconfigure
terraform-plan:
-	terraform plan
+	terraform plan -parallelism=8
terraform-apply:
-	terraform apply -auto-approve
+	terraform apply -auto-approve -parallelism=8
terraform-refresh:
terraform refresh
terraform-destroy:
7 changes: 4 additions & 3 deletions terraform/data/amazon-emr/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh
@@ -6,7 +6,8 @@ echo ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPWhR5NV13iw0X8lKqsrSRqbcIJcA5AVMjyfJjO

echo "# Install Python"
# https://github.com/aws-samples/aws-emr-utilities/blob/main/utilities/emr-ec2-custom-python3/README.md
-PYTHON_VERSION=3.11.6
+# Update the corresponding hm_sedona_emr -> PYSPARK_PYTHON version in terraform/main.tf
+PYTHON_VERSION=3.11.7
sudo yum --assumeyes remove openssl-devel*
sudo yum --assumeyes install \
bzip2-devel \
@@ -33,10 +34,10 @@ echo "# Install dependencies"
sudo curl --silent --fail --show-error --location --remote-name --output-dir /usr/lib/spark/jars/ https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-3.4_2.12/1.5.0/sedona-spark-shaded-3.4_2.12-1.5.0.jar
sudo curl --silent --fail --show-error --location --remote-name --output-dir /usr/lib/spark/jars/ https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/1.5.0-28.2/geotools-wrapper-1.5.0-28.2.jar
"/usr/local/python${PYTHON_VERSION}/bin/python${PYTHON_VERSION%.*}" -m pip install \
-  apache-sedona==1.5.0 \
+  apache-sedona[spark]==1.5.0 \
attrs==23.1.0 \
descartes==1.1.0 \
geopandas==0.14.1 \
matplotlib==3.8.2 \
-  pandas==2.1.3 \
+  pandas==2.1.4 \
shapely==2.0.2
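The bootstrap script now installs apache-sedona[spark]==1.5.0, and terraform/main.tf (below) ships the matching shaded JAR and registers SedonaSqlExtensions. A minimal sketch of what a job on this cluster could then run, assuming those spark-defaults are in effect:

```python
# Minimal sketch, assuming spark.sql.extensions includes SedonaSqlExtensions
# (configured in terraform/main.tf), so Sedona's ST_ functions are registered.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sedona-smoke-test").getOrCreate()
df = spark.sql("SELECT ST_AsText(ST_Point(-122.33, 47.61)) AS wkt")
df.show(truncate=False)
```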
15 changes: 15 additions & 0 deletions terraform/data/amazon-emr/hm-amazon-emr-cluster-sedona/steps/validate_python_version.py
@@ -0,0 +1,15 @@
import logging
import sys

from pyspark.sql import SparkSession


def main() -> None:
SparkSession.builder.getOrCreate()
logging.info(sys.version_info)
assert (sys.version_info.major, sys.version_info.minor) == (3, 11)


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
main()
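The step above only checks the driver's interpreter. A hypothetical extension (not in this commit) could also confirm that executors pick up the interpreter selected by PYSPARK_PYTHON:

```python
# Hypothetical extension, not part of this commit: verify the executor-side
# Python version, which is what the PYSPARK_PYTHON export controls.
import sys

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
executor_version = (
    spark.sparkContext.parallelize([0], 1)
    .map(lambda _: tuple(sys.version_info[:2]))
    .collect()[0]
)
assert executor_version == (3, 11), executor_version
```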
129 changes: 82 additions & 47 deletions terraform/main.tf
@@ -57,31 +57,33 @@ module "hm_trino" {
core_instance_type = "r7a.2xlarge"
core_target_on_demand_capacity = 1
bootstrap_set_up_script_s3_uri = module.hm_trino_s3_set_up_script.uri
-  configurations = [
-    {
-      Classification : "delta-defaults",
-      Properties : {
-        "delta.enabled" : "true"
-      }
-    },
-    {
-      Classification : "trino-connector-delta",
-      Properties : {
-        "hive.metastore" : "glue"
-      }
-    },
-    {
-      Classification : "trino-connector-postgresql",
-      Properties : {
-        connection-url : "jdbc:postgresql://${jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["postgres_host"]}:${jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["postgres_port"]}/${jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["postgres_db"]}",
-        connection-user : jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["postgres_user"],
-        connection-password : jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["postgres_password"]
-      }
-    }
-  ]
-  iam_role_arn = "arn:aws:iam::272394222652:role/service-role/AmazonEMR-ServiceRole-hm"
-  environment = var.environment
-  team = var.team
+  configurations_json_string = <<EOF
+[
+  {
+    "Classification": "delta-defaults",
+    "Properties": {
+      "delta.enabled": "true"
+    }
+  },
+  {
+    "Classification": "trino-connector-delta",
+    "Properties": {
+      "hive.metastore": "glue"
+    }
+  },
+  {
+    "Classification": "trino-connector-postgresql",
+    "Properties": {
+      "connection-url": "jdbc:postgresql://${jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["host"]}:${jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["port"]}/${jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["db"]}",
+      "connection-user": "${jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["user"]}",
+      "connection-password": "${jsondecode(data.aws_secretsmanager_secret_version.hm_rds_secret_version.secret_string)["password"]}"
+    }
+  }
+]
+EOF
+  iam_role_arn = "arn:aws:iam::272394222652:role/service-role/AmazonEMR-ServiceRole-hm"
+  environment = var.environment
+  team = var.team
}
module "hm_trino_task_instance_fleet" {
source = "./modules/hm_amazon_emr_cluster_task_instance_fleet"
@@ -108,6 +110,12 @@ module "hm_sedona_s3_set_up_script" {
amazon_s3_key = "amazon-emr/clusters/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh"
local_file_path = "./data/amazon-emr/hm-amazon-emr-cluster-sedona/bootstrap-actions/set_up.sh"
}
module "hm_sedona_s3_validate_python_version_script" {
source = "./modules/hm_amazon_s3_object"
amazon_s3_bucket = "hongbomiao-bucket"
amazon_s3_key = "amazon-emr/clusters/hm-amazon-emr-cluster-sedona/steps/validate_python_version.py"
local_file_path = "./data/amazon-emr/hm-amazon-emr-cluster-sedona/steps/validate_python_version.py"
}
module "hm_sedona_emr" {
source = "./modules/hm_amazon_emr_cluster"
amazon_emr_cluster_name = "hm-sedona"
Expand All @@ -117,32 +125,59 @@ module "hm_sedona_emr" {
core_instance_type = "r7a.2xlarge"
core_target_on_demand_capacity = 1
bootstrap_set_up_script_s3_uri = module.hm_sedona_s3_set_up_script.uri
-  configurations = [
-    {
-      Classification : "delta-defaults",
-      Properties : {
-        "delta.enabled" : "true"
-      }
-    },
-    {
-      "Classification" : "spark-hive-site",
-      "Properties" : {
-        "hive.metastore.client.factory.class" : "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
-      }
-    },
-    {
-      "Classification" : "spark-defaults",
-      "Properties" : {
-        "spark.yarn.dist.jars" : "/usr/lib/spark/jars/sedona-spark-shaded-3.4_2.12-1.5.0.jar,/usr/lib/spark/jars/geotools-wrapper-1.5.0-28.2.jar",
-        "spark.serializer" : "org.apache.spark.serializer.KryoSerializer",
-        "spark.kryo.registrator" : "org.apache.sedona.core.serde.SedonaKryoRegistrator",
-        "spark.sql.extensions" : "org.apache.sedona.viz.sql.SedonaVizExtensions,org.apache.sedona.sql.SedonaSqlExtensions"
-      }
-    }
-  ]
-  iam_role_arn = "arn:aws:iam::272394222652:role/service-role/AmazonEMR-ServiceRole-hm"
-  environment = var.environment
-  team = var.team
+  steps = [
+    {
+      name = "Validate Python Version"
+      action_on_failure = "CONTINUE"
+      hadoop_jar_step = [
+        {
+          jar = "command-runner.jar"
+          args = ["spark-submit", "--deploy-mode", "client", module.hm_sedona_s3_validate_python_version_script.uri]
+          main_class = ""
+          properties = {}
+        }
+      ]
+    }
+  ]
+  configurations_json_string = <<EOF
+[
+  {
+    "Classification" : "delta-defaults",
+    "Properties" : {
+      "delta.enabled" : "true"
+    }
+  },
+  {
+    "Classification" : "spark-hive-site",
+    "Properties" : {
+      "hive.metastore.client.factory.class" : "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
+    }
+  },
+  {
+    "Classification" : "spark-defaults",
+    "Properties" : {
+      "spark.yarn.dist.jars" : "/usr/lib/spark/jars/sedona-spark-shaded-3.4_2.12-1.5.0.jar,/usr/lib/spark/jars/geotools-wrapper-1.5.0-28.2.jar",
+      "spark.serializer" : "org.apache.spark.serializer.KryoSerializer",
+      "spark.kryo.registrator" : "org.apache.sedona.core.serde.SedonaKryoRegistrator",
+      "spark.sql.extensions" : "org.apache.sedona.viz.sql.SedonaVizExtensions,org.apache.sedona.sql.SedonaSqlExtensions"
+    }
+  },
+  {
+    "Classification": "spark-env",
+    "Configurations": [
+      {
+        "Classification": "export",
+        "Properties": {
+          "PYSPARK_PYTHON": "/usr/local/python3.11.7/bin/python3.11"
+        }
+      }
+    ]
+  }
+]
+EOF
+  iam_role_arn = "arn:aws:iam::272394222652:role/service-role/AmazonEMR-ServiceRole-hm"
+  environment = var.environment
+  team = var.team
}
module "hm_sedona_emr_task_instance_fleet" {
source = "./modules/hm_amazon_emr_cluster_task_instance_fleet"
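For reference, the new steps block maps onto the EMR AddJobFlowSteps API; a rough boto3 equivalent for re-running the validation step against an existing cluster might look like the sketch below (the cluster ID is a placeholder, and the S3 URI is derived from the hm_sedona_s3_validate_python_version_script module above):

```python
# Rough sketch only; "j-XXXXXXXXXXXXX" is a placeholder cluster ID.
import boto3

emr_client = boto3.client("emr", region_name="us-west-2")
emr_client.add_job_flow_steps(
    JobFlowId="j-XXXXXXXXXXXXX",
    Steps=[
        {
            "Name": "Validate Python Version",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar": "command-runner.jar",
                "Args": [
                    "spark-submit",
                    "--deploy-mode",
                    "client",
                    "s3://hongbomiao-bucket/amazon-emr/clusters/hm-amazon-emr-cluster-sedona/steps/validate_python_version.py",
                ],
            },
        }
    ],
)
```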
6 changes: 4 additions & 2 deletions terraform/modules/hm_amazon_emr_cluster/main.tf
@@ -44,7 +44,8 @@ resource "aws_emr_cluster" "hm_amazon_emr_cluster" {
name = "set_up"
path = var.bootstrap_set_up_script_s3_uri
}
-  configurations_json = jsonencode(var.configurations)
+  configurations_json = var.configurations_json_string
+  step = var.steps
service_role = var.iam_role_arn
tags = {
for-use-with-amazon-emr-managed-policies = true
@@ -56,7 +57,8 @@ resource "aws_emr_cluster" "hm_amazon_emr_cluster" {
# https://github.com/hashicorp/terraform-provider-aws/issues/12683#issuecomment-752899019
lifecycle {
ignore_changes = [
-      configurations_json
+      configurations_json,
+      step
]
}
}
11 changes: 6 additions & 5 deletions terraform/modules/hm_amazon_emr_cluster/variables.tf
@@ -19,11 +19,12 @@ variable "core_target_on_demand_capacity" {
variable "bootstrap_set_up_script_s3_uri" {
type = string
}
variable "configurations" {
type = list(object({
Classification = string
Properties = map(string)
}))
variable "configurations_json_string" {
type = string
}
variable "steps" {
type = list(any)
default = []
}
variable "iam_role_arn" {
type = string
