# Analyze leaf area index data set

## Method

1. Load leaf area index data set
2. Check data set schema
3. Compute statistical distribution for data set metrics

## Preamble

In [0]:
import os
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark.sql import functions as F

In [0]:
# Select the MLflow experiment that the run started by this notebook is recorded under.
experiment_name = "/Lab_Project_FruitDropZGS/mlflow_experiment/analyze_leaf_area_index_data_set_experiment"
mlflow.set_experiment(experiment_name=experiment_name)

In [0]:
# Start the MLflow run; it stays active until mlflow.end_run() is called in the last cell.
run_mlflow = mlflow.start_run()

# Scratch directory for this run's artifacts, keyed by the run id.
# The trailing slash matters: later cells build file paths by appending a
# file name directly to this string.
local_tmp_artifact_dir_path = f"/Workspace/Lab_Project_FruitDropZGS/notebook_artifacts/tmp/{run_mlflow.info.run_id}/artifacts/"

# exist_ok=True creates the directory tree if needed and is a no-op when it is
# already there, replacing the separate exists() check and its check-then-act gap.
os.makedirs(local_tmp_artifact_dir_path, exist_ok=True)

# Show the directory path as the cell output.
local_tmp_artifact_dir_path

## Configure parameters

In [0]:
# Bring the parameter sets defined in leaf_area_index_params into scope.
# NOTE(review): the wildcard import hides which names this notebook relies on;
# consider importing params_all_orchards / params_select_KPIN explicitly once
# it is confirmed that nothing else from that module is used.
from leaf_area_index_params import *

# Active parameter set. Swap to the commented-out line to use the other set.
params = params_all_orchards
# params = params_select_KPIN

# Record the chosen parameters on the active MLflow run.
mlflow.log_params(params)

# Show the parameters as the cell output.
params

In [0]:
# Build the row filter that is applied when the table is loaded.
if params["KPIN"]:
    # A KPIN selection is configured: keep only rows whose KPIN is in it,
    # and record the selection on the MLflow run.
    kpin_condition = col("KPIN").isin(params["KPIN"])
    mlflow.log_params({"KPIN": params["KPIN"]})
else:
    # No KPIN selection: keep every row. A literal TRUE predicate says this
    # directly, instead of the always-true "is not null OR is null" expression.
    kpin_condition = F.lit(True)

## Load data

### Load leaf area index schema

In [0]:
# Expected schema for the leaf area index table; a later cell compares it
# against the schema of the loaded DataFrame.
from leaf_area_index_schema import leaf_area_index_schema

# Show the expected schema as the cell output.
leaf_area_index_schema

In [0]:
# Read the configured table, apply the KPIN filter, and cache the filtered
# DataFrame because the following cells reuse it repeatedly.
initial_sdf = spark.table(params["table_name"])
lai_data_sdf = initial_sdf.filter(kpin_condition)
lai_data_sdf = lai_data_sdf.cache()

# Despite the "_sdf" suffix, this holds the result of count(), i.e. the
# number of rows, which is logged as an MLflow metric.
lai_data_count_sdf = lai_data_sdf.count()
mlflow.log_metric(key="lai_data_count", value=lai_data_count_sdf)

lai_data_sdf.printSchema()

In [0]:
# Print a preview of the filtered data, then show the row count computed
# in the previous cell as the cell output.
lai_data_sdf.show()
lai_data_count_sdf

In [0]:
# Stop the notebook when the loaded table does not match the expected schema.
# An explicit raise (rather than a bare assert) still runs under "python -O"
# and reports both schemas so the difference can be diagnosed from the error.
if not (leaf_area_index_schema == lai_data_sdf.schema):
    raise AssertionError(
        "Leaf area index data set does not match the expected schema.\n"
        f"Expected: {leaf_area_index_schema}\n"
        f"Actual:   {lai_data_sdf.schema}"
    )

## Compute statistical distribution for dimensions

In [0]:
# Summarise the dimension columns with the count, min, max and quartile
# statistics requested below, then collect the small result into pandas.
dimension_columns = ["Grower", "KPIN", "MaturityArea", "Bay"]
dimension_summary_statistics = ("count", "min", "25%", "50%", "75%", "max")

lai_data_dimension_summary_sdf = (
    lai_data_sdf
    .select(*dimension_columns)
    .summary(*dimension_summary_statistics)
)

lai_data_dimension_summary_pdf = lai_data_dimension_summary_sdf.toPandas()

# Show the summary table as the cell output.
lai_data_dimension_summary_pdf

In [0]:
# Write the dimension summary into the run's scratch directory.
# reset_index() turns the existing index into a regular column, so it is
# written to the CSV even though index=False is passed.
lai_data_dimension_summary_pdf.reset_index().to_csv(
    f"{local_tmp_artifact_dir_path}lai_data_dimension_summary.csv",
    index=False,
)

## Compute statistical distribution for metrics

In [0]:
# Drop the dimension columns and "Units", then summarise the remaining
# columns with summary()'s default statistics.
non_metric_columns = ("Grower", "KPIN", "MaturityArea", "Bay", "Units")

lai_data_metrics_sdf = lai_data_sdf.drop(*non_metric_columns).summary()

lai_data_metrics_pdf = lai_data_metrics_sdf.toPandas()

# Show the summary table as the cell output.
lai_data_metrics_pdf

In [0]:
# Write the metrics summary into the run's scratch directory.
# reset_index() turns the existing index into a regular column, so it is
# written to the CSV even though index=False is passed.
lai_data_metrics_pdf.reset_index().to_csv(
    f"{local_tmp_artifact_dir_path}lai_data_metrics_summary.csv",
    index=False,
)

## Distribution of row counts per date by KPIN and maturity area

In [0]:
# Count rows per (KPIN, MaturityArea, Date) and sort by the same keys.
kpin_maturity_area_date_columns = ["KPIN", "MaturityArea", "Date"]

lai_data_kpin_maturity_area_date_sdf = (
    lai_data_sdf
    .groupBy(*kpin_maturity_area_date_columns)
    .count()
    .withColumnRenamed("count", "count_date")
    .orderBy(*kpin_maturity_area_date_columns)
)

lai_data_kpin_maturity_area_date_sdf.printSchema()
lai_data_kpin_maturity_area_date_sdf.show()

# Number of distinct (KPIN, MaturityArea, Date) combinations, shown as the cell output.
lai_data_kpin_maturity_area_date_sdf.count()

In [0]:
# Summarise the per-date row counts computed in the previous cell.
count_date_sdf = lai_data_kpin_maturity_area_date_sdf.select("count_date")
lai_data_kpin_maturity_area_date_summary_sdf = count_date_sdf.summary()

lai_data_kpin_maturity_area_date_summary_sdf.show()

In [0]:
# Collect the count_date summary into pandas and write it into the run's
# scratch directory.
lai_data_kpin_maturity_area_date_summary_pdf = (
    lai_data_kpin_maturity_area_date_summary_sdf.toPandas()
)
lai_data_kpin_maturity_area_date_summary_pdf.to_csv(
    f"{local_tmp_artifact_dir_path}lai_data_kpin_maturity_area_date_summary.csv",
    index=False,
)

### Count of distinct dates by KPIN and maturity area

In [0]:
# The input has one row per (KPIN, MaturityArea, Date), so counting its rows
# per (KPIN, MaturityArea) gives the number of distinct dates for each pair.
# Pairs with the most dates come first.
lai_data_kpin_maturity_area_distinct_date_sdf = (
    lai_data_kpin_maturity_area_date_sdf
    .groupBy("KPIN", "MaturityArea")
    .count()
    .withColumnRenamed("count", "count_distinct_date")
    .orderBy("count_distinct_date", ascending=False)
)

lai_data_kpin_maturity_area_distinct_date_pdf = (
    lai_data_kpin_maturity_area_distinct_date_sdf.toPandas()
)

# Show the table as the cell output.
lai_data_kpin_maturity_area_distinct_date_pdf

In [0]:
# Write the distinct-date counts into the run's scratch directory.
lai_data_kpin_maturity_area_distinct_date_pdf.to_csv(
    f"{local_tmp_artifact_dir_path}lai_data_kpin_maturity_area_distinct_date.csv",
    index=False,
)

## Distribution of average leaf area index across date by KPIN, maturity area and bay

In [0]:
# One LAIaverage value per (Date, KPIN, MaturityArea, Bay), ordered by KPIN.
# NOTE(review): the section title speaks of an average, but the aggregate is
# max(). Presumably LAIaverage is constant within each group, so max() only
# collapses duplicates - confirm against the table definition.
lai_data_across_date_sdf = (
    lai_data_sdf
    .groupBy("Date", "KPIN", "MaturityArea", "Bay")
    .agg(F.max("LAIaverage").alias("LAIaverage"))
    .orderBy("KPIN", ascending=True)
)

lai_data_across_date_sdf.printSchema()
lai_data_across_date_sdf.show()

# Number of (Date, KPIN, MaturityArea, Bay) groups, shown as the cell output.
lai_data_across_date_sdf.count()

### Result: Distribution of average leaf area index across date by KPIN, maturity area and bay

In [0]:
# Collect the per-date LAIaverage values into pandas for plotting.
lai_data_across_date_pdf = lai_data_across_date_sdf.toPandas()

# Build one label per KPIN / maturity area / bay; each label becomes one row
# of the box plot.
lai_data_across_date_pdf["KPIN_maturity_area_bay"] = (
    lai_data_across_date_pdf["KPIN"].astype(str)
    + "_"
    + lai_data_across_date_pdf["MaturityArea"].astype(str)
    + "_"
    + lai_data_across_date_pdf["Bay"].astype(str)
)

# Sort by label so the plot rows follow label order.
lai_data_across_date_pdf = lai_data_across_date_pdf.sort_values(
    by="KPIN_maturity_area_bay",
    ascending=False,
)

fig, ax = plt.subplots(figsize=(18, 26))

# Horizontal box plot: one box per label, spread taken over that label's dates.
sns.boxplot(
    x="LAIaverage",
    y="KPIN_maturity_area_bay",
    data=lai_data_across_date_pdf,
    orient="h",
    ax=ax,
)

# Plain string literals: the original f-prefixes had no placeholders.
ax.set_title("Distribution of LAIaverage across date by KPIN, maturity area and bay")

ax.grid(True)

# Save the figure into the run's scratch directory, then display it.
fig.savefig(local_tmp_artifact_dir_path + "distribution_LAIaverage_across_date_by_kpin_maturity_area_bay.png")
plt.show()

## Wrap up

In [0]:
# Upload everything written to the scratch directory (CSV summaries and the
# plot) as artifacts of the active MLflow run.
mlflow.log_artifacts(local_dir=local_tmp_artifact_dir_path)

In [0]:
# Delete this run's scratch directory.
# local_tmp_artifact_dir_path ends with "/artifacts/": the first dirname()
# strips the trailing slash, the second strips "artifacts", leaving the
# directory named after the run id.
artifacts_dir = os.path.dirname(local_tmp_artifact_dir_path)
tmp_dir = os.path.dirname(artifacts_dir)
dbutils.fs.rm("file:" + tmp_dir, recurse=True)

In [0]:
# End the MLflow run that was started at the top of the notebook.
mlflow.end_run() 