# Analyze leaf analysis data set

## Method

- Load leaf analysis data set        
- Check data set schema
- Compute statistical distribution for leaf analysis
- Compute distribution of leaf analysis across sample time

## Preamble

In [0]:
import os
import pandas as pd
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when

In [0]:
# Workspace path of the MLflow experiment that collects the runs of this notebook.
experiment_name = "/Lab_Project_FruitDropZGS/mlflow_experiment/analyze_leaf_analysis_data_set_experiment"

# Make that experiment the active one for every run started below.
mlflow.set_experiment(experiment_name=experiment_name)

In [0]:
# Start the MLflow run; it stays active across cells until `mlflow.end_run()` is called.
run_mlflow = mlflow.start_run()

# Per-run scratch directory: files written here are uploaded to MLflow during wrap up.
# The trailing slash is relied upon by the cells that build file names by concatenation.
local_tmp_artifact_dir_path = f"/Workspace/Lab_Project_FruitDropZGS/notebook_artifacts/tmp/{run_mlflow.info.run_id}/artifacts/"

# `exist_ok=True` makes re-running this cell safe and avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(local_tmp_artifact_dir_path, exist_ok=True)

local_tmp_artifact_dir_path

## Configure parameters

In [0]:
# Two parameter sets are available; exactly one is bound to `params` below.
from leaf_analysis_params_all_orchards import params_all_orchards
from leaf_analysis_params_selected_kpin import params_selected_kpin

# Active parameter set. Later cells read params["select_kpin"] and params["table_name"].
params = params_all_orchards
# Manual toggle: uncomment to run the analysis with the selected-KPIN parameter set instead.
# params = params_selected_kpin

# Record the chosen parameters on the active MLflow run.
mlflow.log_params(params)

# Display the parameters as the cell output.
params

In [0]:
# Cell-local import, in line with the other cell-level imports of this notebook.
from pyspark.sql.functions import lit

# Build the row filter applied when the table is loaded.
if params["select_kpin"]:
    # Restrict the data set to the KPIN values listed in the parameters.
    filter_kpin = col("KPIN").isin(params["select_kpin"])
else:
    # No selection configured: keep every row. A literal TRUE states this
    # directly instead of the tautology `isNotNull() | isNull()`.
    filter_kpin = lit(True)

filter_conditions = filter_kpin
filter_conditions

## Load data

In [0]:
# Schema the source table is compared against in the schema-check cell below.
from leaf_analysis_expected_schema import expected_schema

# Display the expected schema as the cell output.
expected_schema

In [0]:
# Read the source table named in the run parameters.
initial_sdf = spark.table(params["table_name"])

# Keep only the rows that satisfy the KPIN filter built earlier.
leaf_sdf = initial_sdf.where(filter_conditions)

# Row count after filtering, tracked as an MLflow metric.
leaf_sdf_count = leaf_sdf.count()
mlflow.log_metric(key="leaf_sdf_count", value=leaf_sdf_count)

# Inspect the filtered data: schema, a sample of rows, then the row count as cell output.
leaf_sdf.printSchema()
leaf_sdf.show()
leaf_sdf_count

In [0]:
# Stop the notebook when the table schema differs from the expected one,
# and show both schemas so the difference can be located from the failure message.
assert expected_schema == initial_sdf.schema, (
    "Leaf analysis table schema does not match the expected schema.\n"
    f"Expected: {expected_schema}\n"
    f"Actual:   {initial_sdf.schema}"
)

## Distribution of data analysis by KPIN and maturity area

In [0]:
# Number of rows per (KPIN, MA, data analysis) combination, ordered by "data analysis".
data_analysis_group_columns = ["KPIN", "MA", "data analysis"]

leaf_data_analysis_count_sdf = (
    leaf_sdf
    .groupBy(*data_analysis_group_columns)
    .count()
    .orderBy("data analysis")
)

# Show a sample of the groups, then the number of groups as cell output.
leaf_data_analysis_count_sdf.show()
leaf_data_analysis_count_sdf.count()

In [0]:
# Summary statistics of the per-group row counts.
leaf_data_analysis_count_summary_sdf = (
    leaf_data_analysis_count_sdf
    .select("count")
    .summary()
)

leaf_data_analysis_count_summary_sdf.show()

### Count distinct data analysis by KPIN and maturity area

In [0]:
# The input has one row per (KPIN, MA, data analysis), so counting its rows per
# (KPIN, MA) gives the number of distinct "data analysis" values for that pair.
leaf_data_analysis_distinct_count_sdf = (
    leaf_data_analysis_count_sdf
    .groupBy("KPIN", "MA")
    .count()
    .withColumnRenamed("count", "count_distinct_data_analysis")
    .orderBy(col("count_distinct_data_analysis").desc())
)

# Collect to pandas for display and for the CSV export in the next cell.
leaf_data_analysis_distinct_count_pdf = leaf_data_analysis_distinct_count_sdf.toPandas()

leaf_data_analysis_distinct_count_pdf

In [0]:
# Write the distinct-count table into the run's artifact directory (row index omitted).
leaf_data_analysis_distinct_count_pdf.to_csv(
    f"{local_tmp_artifact_dir_path}leaf_data_analysis_distinct_count.csv",
    index=False,
)

## Compute statistical distribution for leaf analysis

In [0]:
# Leaf measurement columns included in the summary.
leaf_summary_columns = [
    "N %",
    "P %",
    "K %",
    "Ca %",
    "Mg %",
    "Na %",
    "Fe ppm",
    "Mn ppm",
    "Cu ppm",
    "Zn ppm",
    "B ppm",
]

# Spark summary statistics for every measurement column.
leaf_analysis_summary_sdf = leaf_sdf.select(*leaf_summary_columns).summary()

# Collect to pandas for display and for the CSV export in the next cell.
leaf_analysis_summary_pdf = leaf_analysis_summary_sdf.toPandas()
leaf_analysis_summary_pdf.info()
leaf_analysis_summary_pdf

In [0]:
# Write the summary statistics into the run's artifact directory (row index omitted).
leaf_analysis_summary_pdf.to_csv(
    f"{local_tmp_artifact_dir_path}leaf_analysis_summary.csv",
    index=False,
)

## Boxplot analysis of distribution of leaf analysis across sample time by KPIN and maturity area

In [0]:
# Identifier columns first, then the leaf measurement columns.
boxplot_id_columns = ["KPIN", "MA", "Sample time"]
boxplot_measurement_columns = [
    "N %",
    "P %",
    "K %",
    "Ca %",
    "Mg %",
    "Na %",
    "Fe ppm",
    "Mn ppm",
    "Cu ppm",
    "Zn ppm",
    "B ppm",
]

leaf_analysis_by_kpin_maturity_area_sdf = leaf_sdf.select(
    *boxplot_id_columns,
    *boxplot_measurement_columns,
)

# Inspect the projection: schema, a sample of rows, then the row count as cell output.
leaf_analysis_by_kpin_maturity_area_sdf.printSchema()
leaf_analysis_by_kpin_maturity_area_sdf.show()
leaf_analysis_by_kpin_maturity_area_sdf.count()

### Result: boxplot analysis of distribution of leaf analysis across sample time by KPIN and maturity area

In [0]:
# Collect the selected columns to pandas for plotting with seaborn.
leaf_analysis_by_kpin_maturity_area_pdf = leaf_analysis_by_kpin_maturity_area_sdf.toPandas()

# Combined "<KPIN>_<MA>" label used as the categorical axis of the boxplots.
leaf_analysis_by_kpin_maturity_area_pdf["KPIN_MA"] = (
    leaf_analysis_by_kpin_maturity_area_pdf["KPIN"].astype(str)
    + "_"
    + leaf_analysis_by_kpin_maturity_area_pdf["MA"]
)

leaf_analysis_by_kpin_maturity_area_pdf = leaf_analysis_by_kpin_maturity_area_pdf.sort_values(by="KPIN_MA", ascending=False)

# Select the measurement columns by name rather than by position. The previous
# positional slice `iloc[:, 4:-1]` started one column too late and silently
# skipped "N %", which sits at index 3 after KPIN, MA and "Sample time".
non_measurement_columns = ["KPIN", "MA", "Sample time", "KPIN_MA"]
leaf_analysis = leaf_analysis_by_kpin_maturity_area_pdf.drop(columns=non_measurement_columns)

# One horizontal boxplot per measurement; iterating a DataFrame yields its column names.
for analysis in leaf_analysis:
    fig, ax = plt.subplots(figsize=(16, 22))
    sns.boxplot(
        x=analysis,
        y="KPIN_MA",
        hue="Sample time",
        data=leaf_analysis_by_kpin_maturity_area_pdf,
        orient="h",
        ax=ax,
    )
    # NOTE(review): the title says "Fruit" although this notebook plots leaf analysis data — confirm the wording.
    ax.set_title(f"Fruit {analysis} by KPIN and MA")
    ax.grid()
    # Save before showing so the file is written from the fully drawn figure,
    # then close the figure so the large canvases do not accumulate in memory.
    fig.savefig(local_tmp_artifact_dir_path + f"analysis_{analysis}_across_sample_time_by_kpin_maturity_area.png")
    plt.show()
    plt.close(fig)

## Wrap up

In [0]:
# Upload everything written to the run's scratch directory as MLflow artifacts.
mlflow.log_artifacts(local_dir=local_tmp_artifact_dir_path)

In [0]:
# Remove this run's temporary directory (the parent of the "artifacts" folder).
# `normpath` drops the trailing slash first, so a single `dirname` reaches the
# parent whether or not the configured path ends with "/".
tmp_dir = os.path.dirname(os.path.normpath(local_tmp_artifact_dir_path))
dbutils.fs.rm("file:" + tmp_dir, recurse=True)

In [0]:
# Close the MLflow run started in the preamble, marking it as finished.
mlflow.end_run(status="FINISHED")