# Analyze maturity area data set

## Objectives
1. Load maturity data sets
2. Cross-reference tables by their primary keys
3. Analyze the cross-referenced table


## Preamble

In [0]:
import os
import mlflow
import pandas as pd

In [0]:
# Select the MLflow experiment that runs started by this notebook are logged under.
experiment_name = (
    "/Lab_Project_FruitDropZGS/mlflow_experiment/"
    "analyze_maturity_area_data_set_experiment"
)
mlflow.set_experiment(experiment_name=experiment_name)

In [0]:
# Start the MLflow run; it stays active until the wrap-up section calls
# mlflow.end_run().
run_mlflow = mlflow.start_run()

# Per-run scratch directory, keyed by the run id. Files written here are
# uploaded as run artifacts in the wrap-up section. The trailing slash is
# required: later cells append file names to this path by string concatenation.
local_tmp_artifact_dir_path = (
    "/Workspace/Lab_Project_FruitDropZGS/notebook_artifacts/tmp/"
    f"{run_mlflow.info.run_id}/artifacts/"
)

# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(local_tmp_artifact_dir_path, exist_ok=True)

# Display the resolved path as the cell output.
local_tmp_artifact_dir_path

## Configure parameters

In [0]:
# The data-set configuration imported from the project module is used as this
# run's parameter set; later cells look up source table names in it.
from maturity_area_data_sets import maturity_area_data_sets

params = maturity_area_data_sets
mlflow.log_params(params=params)

# Display the parameters as the cell output.
params

## Load data

In [0]:
# Expected schema of the orchard table, provided by a project module.
# The bare expression below displays it as the cell output.
from maturity_area_orchard_expected_schema import maturity_area_orchard_expected_schema

maturity_area_orchard_expected_schema

In [0]:
# Read the orchard table whose name is configured under "maturity_area_orchard".
maturity_area_orchard_sdf = spark.table(params["maturity_area_orchard"])

# Record the row count on the MLflow run.
# Note: despite the `_sdf` suffix this variable holds an integer, not a DataFrame.
maturity_area_orchard_count_sdf = maturity_area_orchard_sdf.count()
mlflow.log_metric(
    key="maturity_area_orchard_count",
    value=maturity_area_orchard_count_sdf,
)

# Inspect the schema and a sample of rows, then display the count as cell output.
maturity_area_orchard_sdf.printSchema()
maturity_area_orchard_sdf.show()
maturity_area_orchard_count_sdf

In [0]:
# Stop the notebook when the orchard table's schema differs from the expected
# one; the message carries both schemas so the mismatch can be diagnosed from
# the error alone.
assert maturity_area_orchard_expected_schema == maturity_area_orchard_sdf.schema, (
    "maturity_area_orchard schema mismatch:\n"
    f"expected: {maturity_area_orchard_expected_schema}\n"
    f"actual:   {maturity_area_orchard_sdf.schema}"
)

In [0]:
# Expected schema of the seasonal table, provided by a project module.
# The bare expression below displays it as the cell output.
from maturity_area_seasonal_expected_schema import maturity_area_seasonal_expected_schema

maturity_area_seasonal_expected_schema

In [0]:
# Read the seasonal table whose name is configured under
# "maturity_area_orchard_seasonal".
maturity_area_seasonal_sdf = spark.table(params["maturity_area_orchard_seasonal"])

# Record the row count on the MLflow run.
# Note: despite the `_sdf` suffix this variable holds an integer, not a DataFrame.
maturity_area_seasonal_count_sdf = maturity_area_seasonal_sdf.count()
mlflow.log_metric(
    key="maturity_area_seasonal_count",
    value=maturity_area_seasonal_count_sdf,
)

# Inspect the schema and a sample of rows, then display the count as cell output.
maturity_area_seasonal_sdf.printSchema()
maturity_area_seasonal_sdf.show()
maturity_area_seasonal_count_sdf

In [0]:
# Stop the notebook when the seasonal table's schema differs from the expected
# one; the message carries both schemas so the mismatch can be diagnosed from
# the error alone.
assert maturity_area_seasonal_expected_schema == maturity_area_seasonal_sdf.schema, (
    "maturity_area_orchard_seasonal schema mismatch:\n"
    f"expected: {maturity_area_seasonal_expected_schema}\n"
    f"actual:   {maturity_area_seasonal_sdf.schema}"
)

In [0]:
# Expected schema of the joined (orchard + seasonal) table, provided by a
# project module. The bare expression below displays it as the cell output.
from joint_maturity_area_expected_schema import joint_maturity_area_expected_schema

joint_maturity_area_expected_schema

In [0]:
# Cross-reference the orchard and seasonal tables on their shared key columns.
# Joining on a list of column names keeps a single copy of each key column;
# "inner" is Spark's default join type, spelled out here for clarity.
joined_maturity_area_sdf = maturity_area_orchard_sdf.join(
    maturity_area_seasonal_sdf,
    on=["Maturity_Area", "KPIN"],
    how="inner",
)

# Inspect the result; the final expression displays the joined row count.
joined_maturity_area_sdf.printSchema()
joined_maturity_area_sdf.show()
joined_maturity_area_sdf.count()

In [0]:
# Stop the notebook when the joined table's schema differs from the expected
# one; the message carries both schemas so the mismatch can be diagnosed from
# the error alone.
assert joint_maturity_area_expected_schema == joined_maturity_area_sdf.schema, (
    "joined maturity area schema mismatch:\n"
    f"expected: {joint_maturity_area_expected_schema}\n"
    f"actual:   {joined_maturity_area_sdf.schema}"
)

In [0]:
# Compute summary statistics of the joined table with DataFrame.summary()
# and print them.
joined_maturity_area_summary_sdf = joined_maturity_area_sdf.summary()

joined_maturity_area_summary_sdf.show()

In [0]:
# Convert the summary to pandas and save it as CSV in the run's scratch
# directory, from which the wrap-up section uploads it as an MLflow artifact.
joined_maturity_area_summary_pdf = joined_maturity_area_summary_sdf.toPandas()

# local_tmp_artifact_dir_path already ends with "/", so the file name is
# appended directly.
summary_csv_path = f"{local_tmp_artifact_dir_path}joined_maturity_area_summary.csv"
joined_maturity_area_summary_pdf.to_csv(summary_csv_path, index=False)

## Write joint maturity area data set to Databricks Catalog

In [0]:
# Load the current contents of the catalog table that holds the joined data
# set, so the next cell can compare it with the freshly joined DataFrame.
# NOTE(review): spark.table() raises if the table does not exist, so this cell
# presumably relies on the table having been created beforehand — confirm how
# the first run is bootstrapped.
target_table_name = "datalake_lab.lab_project_fruitdropzgs.joined_maturity_area_data_set"
target_table_sdf = spark.table(target_table_name)
target_table_sdf.printSchema()
target_table_sdf.show()

In [0]:
# Rewrite the catalog table only when its rows differ from the freshly joined
# data. exceptAll() preserves duplicate rows, so an empty difference in both
# directions means the two DataFrames hold the same multiset of rows.
rows_only_in_target_sdf = target_table_sdf.exceptAll(joined_maturity_area_sdf)
rows_only_in_joined_sdf = joined_maturity_area_sdf.exceptAll(target_table_sdf)

# `and` short-circuits: the second count only runs when the first difference
# is empty.
if rows_only_in_target_sdf.count() == 0 and rows_only_in_joined_sdf.count() == 0:
    print("The dataframes are equal.")
else:
    joined_maturity_area_sdf.write.mode("overwrite").saveAsTable(target_table_name)
    print("The DataFrames are not equal. Overwrote the table with the new version.")

In [0]:
# Print the target table's version history (e.g. to see whether the previous
# cell wrote a new version).
table_history_sdf = spark.sql(f"DESCRIBE HISTORY {target_table_name}")
table_history_sdf.show()

## Wrap up

In [0]:
# Upload every file in the run's scratch directory as MLflow run artifacts.
mlflow.log_artifacts(local_dir=local_tmp_artifact_dir_path)

In [0]:
# Delete this run's scratch directory now that its files have been logged.
# local_tmp_artifact_dir_path ends with "/", so the first dirname() only drops
# that slash (".../<run_id>/artifacts") and the second yields ".../<run_id>".
artifacts_dir = os.path.dirname(local_tmp_artifact_dir_path)
tmp_dir = os.path.dirname(artifacts_dir)

# The "file:" scheme points dbutils at the local filesystem rather than DBFS.
dbutils.fs.rm(f"file:{tmp_dir}", recurse=True)

In [0]:
# End the active MLflow run that was started in the preamble with
# mlflow.start_run().
mlflow.end_run()