## Migrating Oozie Workflows to Airflow CDE DAGs

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes (presumably so edits to migration_utility
# are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
!pip install pandas
!pip install requests_toolbelt
!pip install xmltodict

In [None]:
import migration_utility.workflow as workflow
import migration_utility.cdejob as cdejob
import migration_utility.cderesource as cderesource
import os

In [None]:
# Local directories: the Oozie project is read from the first, generated
# artifacts (the DAG file) are written to the second.
project_input_dir, project_output_dir = "input", "output"

# Name of the generated Airflow DAG and of the file that holds it.
dag_name = "combined"
dag_file = "combined.py"

# CDE naming: a prefix plus the resource and job names used below.
cde_prefix = "jmontenaro_combined"
cde_job_name = "job"
cde_resource_name = "resource"

# Connection name handed to CDEJob for Hive.
hive_connection = "default-hive-aws"

In [None]:
# Settings read back from os.environ further down when the CDE resource client
# is created and a token is requested.
# NOTE(review): these are blank placeholders - fill them in before running, and
# avoid committing a real password together with the notebook.
os.environ.update(
    WORKLOAD_USER="",
    WORKLOAD_PASSWORD="",
    JOBS_API_URL="",
)

In [None]:
# Create the output directory (and any missing parents) from Python instead of
# shelling out to `mkdir -p`: no shell is required, the directory name is not
# interpolated into a shell command, and exist_ok=True keeps re-runs harmless.
os.makedirs(project_output_dir, exist_ok=True)

In [None]:
# Create the Oozie workflow helper for the project in `project_input_dir`
# (presumably the directory holding the workflow definition and its
# properties - confirm in migration_utility.workflow).
ow = workflow.OozieWorkflow(project_input_dir)

In [None]:
# Build the workflow dictionary via OozieWorkflow.create_workflow_dict().
# The bare name on the last line makes the notebook display it for inspection.
workflow_dict = ow.create_workflow_dict()
workflow_dict

In [None]:
# Collect the workflow properties via OozieWorkflow.create_workflow_props()
# and display them.
workflow_props = ow.create_workflow_props()
workflow_props

In [None]:
# Rebind workflow_dict to the result of replace_workflow_props(), presumably
# the same structure with property placeholders substituted (confirm in
# migration_utility.workflow), then display the result.
workflow_dict = ow.replace_workflow_props(workflow_dict, workflow_props)
workflow_dict

In [None]:
# Create the CDE job builder from the workflow dictionary, the Hive connection
# name, the CDE name prefix, the CDE resource name and the DAG name.
cj = cdejob.CDEJob(workflow_dict, hive_connection, cde_prefix, cde_resource_name, dag_name)

In [None]:
# Three calls against the DAG file in the output directory. Judging by the
# method names they start the file, add its imports and add the DAG
# declaration - confirm in migration_utility.cdejob.
# NOTE(review): 'jmontenaro' is hard-coded here and matches the start of
# cde_prefix; presumably it is the DAG owner/user - confirm and change it for
# your own environment.
cj.initialize_dag(project_output_dir, dag_file)
cj.dag_imports(project_output_dir, dag_file)
cj.dag_declaration('jmontenaro', project_output_dir, dag_file)

In [None]:
# Process the workflow dictionary with parse_oozie_workflow(), which is also
# given the output directory and DAG file name. The returned value is kept as
# `spark_payloads` (iterated later to create one CDE job per item) and
# displayed for inspection.
spark_payloads = cj.parse_oozie_workflow(project_output_dir, dag_file, workflow_dict)
spark_payloads

In [None]:
# Print the DAG file from the output directory so it can be reviewed before
# it is uploaded.
dag_path = f"{project_output_dir}/{dag_file}"
with open(dag_path, "r") as dag_source:
    print(dag_source.read())

In [None]:
# Build the payload for the Airflow job from the DAG file name, the CDE
# resource name and the CDE job name, then display it. It is submitted in the
# final cell via create_job_from_resource().
airflow_cde_payload = cj.oozie_to_cde_airflow_payload(dag_file, cde_resource_name, cde_job_name)
airflow_cde_payload

In [None]:
# Create the CDE resource client from the Jobs API URL and workload user held
# in the environment, plus the CDE prefix and resource name.
# Raises KeyError if JOBS_API_URL or WORKLOAD_USER is not set in os.environ.
cr = cderesource.CDEResource(os.environ["JOBS_API_URL"], os.environ["WORKLOAD_USER"], cde_prefix, cde_resource_name)

In [None]:
# Obtain a token using the workload password from the environment; the token
# is passed to every CDE call below.
token = cr.set_cde_token(os.environ["WORKLOAD_PASSWORD"])

In [None]:
# Create the CDE resource named `cde_resource_name`; files are uploaded to it
# in the following cells.
cr.create_cde_resource(token, cde_resource_name)

In [None]:
# Upload "pi.scala" from the input directory to the CDE resource.
# NOTE(review): the file name is hard-coded; adjust it (or add further uploads)
# for workflows that reference other files.
cr.upload_file(cde_resource_name, project_input_dir, "pi.scala", token)

In [None]:
# Create one CDE job per payload returned by parse_oozie_workflow().
for spark_payload in spark_payloads:
    cr.create_job_from_resource(token, spark_payload)

In [None]:
# Upload the generated DAG file from the output directory to the CDE resource.
cr.upload_file(cde_resource_name, project_output_dir, dag_file, token)

In [None]:
# Create the Airflow CDE job from the payload built earlier; this runs after
# the DAG file upload in the previous cell.
cr.create_job_from_resource(token, airflow_cde_payload)