# Analyze fruit drop data set

## Objectives

- Compute statistical distribution for dimensions and metrics
- Analyze the variance among bays within each maturity area in the counts of fruit per condition
- Explore the distribution of fruit-condition counts across combinations of bay and assessment date

## Preamble

In [0]:
import os
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, when, month

In [0]:
# Select the workspace MLflow experiment that this notebook's runs are recorded under.
experiment_name = (
    "/Lab_Project_FruitDropZGS/mlflow_experiment/"
    "analyze_fruit_drop_data_set_experiment"
)
mlflow.set_experiment(experiment_name=experiment_name)

In [0]:
# Start the MLflow run that the following cells log parameters, metrics and artifacts to.
run_mlflow = mlflow.start_run()

# Per-run scratch directory (keyed by the run id) where CSVs and plots are staged
# before being uploaded as run artifacts.
local_tmp_artifact_dir_path = "/Workspace/Lab_Project_FruitDropZGS/notebook_artifacts/tmp/" + run_mlflow.info.run_id + "/artifacts/"

# exist_ok=True replaces the separate existence check: the cell stays safe to re-run
# and there is no window between checking and creating the directory.
os.makedirs(local_tmp_artifact_dir_path, exist_ok=True)

# Trailing bare expression so the notebook displays the directory in use.
local_tmp_artifact_dir_path

## Configure parameters

In [0]:
# Two alternative parameter sets are imported; one of them is bound to `params` below.
from fruit_drop_params_all_orchards import params_all_orchards
from fruit_drop_params_select_kpin import params_select_kpin

# Switch parameter sets by swapping which of the next two lines is commented out.
params = params_all_orchards
# params = params_select_kpin

# Record the chosen parameters with MLflow.
mlflow.log_params(params)

# Trailing bare expression so the notebook displays the parameters in use.
params

In [0]:
# Build the row filter that is applied when the source table is loaded.
# A non-empty params["KPIN"] restricts the analysis to those KPINs; otherwise every row is kept.
if params["KPIN"]:
  kpin_condition = col("KPIN").isin(params["KPIN"])
  mlflow.log_params({"KPIN": params["KPIN"]})
else:
  # A literal TRUE keeps all rows (including rows whose KPIN is null) and states the
  # intent directly, instead of the tautology `isNotNull() | isNull()`.
  kpin_condition = F.lit(True)

## Load data

In [0]:
# Schema the source table is expected to have; it is compared against the loaded
# table's schema in a later cell.
from fruit_drop_expected_schema import fruit_drop_expected_schema

# Trailing bare expression so the notebook displays the expected schema.
fruit_drop_expected_schema

In [0]:
# Read the configured source table, keep only the rows matching the KPIN filter,
# and cache the result because every later cell reads from it.
initial_sdf = spark.table(params["table_name"])
fruit_drop_sdf = initial_sdf.where(kpin_condition).cache()

# Row count of the filtered data set, logged as a run metric.
fruit_drop_count_sdf = fruit_drop_sdf.count()
mlflow.log_metric(key="fruit_drop_count", value=fruit_drop_count_sdf)

# Show the schema, a sample of rows, and the row count in the notebook output.
fruit_drop_sdf.printSchema()
fruit_drop_sdf.show()
fruit_drop_count_sdf

In [0]:
# Fail fast when the source table's schema differs from the expected one.
# The message shows both schemas so the mismatch can be diagnosed from the failure alone.
assert fruit_drop_expected_schema == initial_sdf.schema, (
    f"Schema mismatch for table {params['table_name']!r}: "
    f"expected {fruit_drop_expected_schema}, got {initial_sdf.schema}"
)

## Compute statistical distribution for dimensions 

In [0]:
# Summary statistics (Spark's DataFrame.summary()) for the dimension columns.
dimensions_summary_sdf = fruit_drop_sdf.select(
    ["Grower", "KPIN", "Maturity Area", "Bay", "Cane number", "cane position"]
).summary()

# Show the summary's schema and contents in the notebook output.
dimensions_summary_sdf.printSchema()
dimensions_summary_sdf.show()

## Compute statistical distribution for metrics

In [0]:
# Summary statistics (Spark's DataFrame.summary()) for the four fruit-condition columns.
metrics_summary_sdf = fruit_drop_sdf.select(
    col("shrinked fruit still attached on the cane"),
    col("not dropped fruit with dry peduncle"),
    col("dropped fruit with dry peduncle"),
    col("dropped fruit with healthy peduncle"),
).summary()

# Show the summary's schema and contents in the notebook output.
metrics_summary_sdf.printSchema()
metrics_summary_sdf.show()

In [0]:
# Bring the metrics summary to the driver as a pandas DataFrame and stage it as a
# CSV file in the per-run artifact directory.
metrics_summary_pdf = metrics_summary_sdf.toPandas()
metrics_summary_pdf.to_csv(
    f"{local_tmp_artifact_dir_path}numerical_values_summary.csv",
    index=False,
)

## Variance among bays in each maturity area in terms of fruit-condition counts

In [0]:
# Sum each fruit-condition column per (Grower, KPIN, Maturity Area, bay, assessment date).
# Each pair below is (source column, name of the summed output column).
total_count_by_fruit_condition_sdf = fruit_drop_sdf.groupBy(
    "Grower", "KPIN", "Maturity Area", "bay", "assessment date"
).agg(
    *[
        F.sum(source_column).alias(total_column)
        for source_column, total_column in (
            ("shrinked fruit still attached on the cane", "total_shrinked_fruit_on_cane"),
            ("not dropped fruit with dry peduncle", "total_not_dropped_dry_peduncle"),
            ("dropped fruit with dry peduncle", "total_dropped_dry_peduncle"),
            ("dropped fruit with healthy peduncle", "total_dropped_healthy_peduncle"),
        )
    ]
)

# Show a sample of the aggregated rows and the number of groups.
total_count_by_fruit_condition_sdf.show()
total_count_by_fruit_condition_sdf.count()

In [0]:
# Bring the per-group totals to the driver as a pandas DataFrame (used for plotting
# below) and stage them as a CSV file in the per-run artifact directory.
total_count_by_fruit_condition_pdf = total_count_by_fruit_condition_sdf.toPandas()
total_count_by_fruit_condition_pdf.to_csv(
    f"{local_tmp_artifact_dir_path}total_count_by_fruit_condition.csv",
    index=False,
)

### Distribution of fruit-condition counts across combinations of bay and assessment date, by KPIN and maturity area

In [0]:
# Label every row with its KPIN / maturity-area combination; this label is the
# grouping axis of the box plots below.
total_count_by_fruit_condition_pdf['KPIN_Maturity_Area'] = (
    total_count_by_fruit_condition_pdf['KPIN'].astype(str)
    + '_'
    + total_count_by_fruit_condition_pdf['Maturity Area']
)

# Select the four aggregated total columns by name. The previous positional slice
# (iloc[:, -5:-1]) only picked the right columns while the column order stayed
# exactly as produced by the aggregation plus the single label column added above.
feature_columns = [
    "total_shrinked_fruit_on_cane",
    "total_not_dropped_dry_peduncle",
    "total_dropped_dry_peduncle",
    "total_dropped_healthy_peduncle",
]
features = total_count_by_fruit_condition_pdf[feature_columns]

# One horizontal box plot per fruit-condition total; iterating a DataFrame yields its column names.
for feature in features:
    fig, ax = plt.subplots(figsize=(16, 22))
    sns.boxplot(x=feature, y="KPIN_Maturity_Area", data=total_count_by_fruit_condition_pdf, orient="h", ax=ax)
    ax.set_title(f"Distribution of {feature} across combinations of bay and assessment date by kpin and maturity area")
    ax.grid()
    # Save before showing: writing the file first does not depend on how the active
    # backend treats a figure after it has been displayed.
    fig.savefig(local_tmp_artifact_dir_path + f"boxplot_for_distribution_of_{feature}.png")
    plt.show()
    # Release the figure so large figures do not accumulate across loop iterations.
    plt.close(fig)

## Wrap up

In [0]:
# Upload everything staged in the per-run scratch directory as MLflow run artifacts.
mlflow.log_artifacts(local_dir=local_tmp_artifact_dir_path)

In [0]:
# Remove this run's scratch directory (the parent of ".../artifacts/").
# normpath drops any trailing slash first, so dirname always resolves to the
# per-run directory; the previous double dirname only did so because the path
# ends in "/", and would otherwise have pointed at the shared tmp directory.
tmp_dir = os.path.dirname(os.path.normpath(local_tmp_artifact_dir_path))
dbutils.fs.rm("file:" + tmp_dir, recurse=True)

In [0]:
# Close the MLflow run opened at the top of the notebook, marking it as finished.
mlflow.end_run(status="FINISHED")