# Analyze canes data set

## Objectives

- Compute statistical distribution for dimensions and metrics
- Analyze the variance among bays in each maturity area in terms of the relevant metrics
- Explore the distribution of initial fruit number compared to median cane diameter
- Compute distribution discrepancy between the cane diameter and positions

In [0]:
import os
import math
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, when
# Apply seaborn's "darkgrid" theme to the plots drawn in this notebook.
sns.set_theme(style="darkgrid")

## Preamble

In [0]:
# Select the MLflow experiment that the run started below is recorded under.
experiment_name = "/Lab_Project_FruitDropZGS/mlflow_experiment/analyze_canes_data_set_experiment"
mlflow.set_experiment(experiment_name)

In [0]:
# Start the MLflow run that collects this notebook's params, metrics and artifacts.
run_mlflow = mlflow.start_run()

# Per-run scratch directory. Files written here are uploaded with
# mlflow.log_artifacts in the wrap-up section. The trailing slash matters:
# file names are appended to this string by plain concatenation.
local_tmp_artifact_dir_path = "/Workspace/Lab_Project_FruitDropZGS/notebook_artifacts/tmp/" + run_mlflow.info.run_id + "/artifacts/"

# exist_ok=True keeps the call idempotent when the cell is re-run and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(local_tmp_artifact_dir_path, exist_ok=True)

# Display the directory as the cell output.
local_tmp_artifact_dir_path

## Configure parameters

In [0]:
# Both parameter sets are imported so the active one can be switched by
# editing the assignment below.
from canes_params_all_orchards import params_all_orchards
from canes_params_select_kpin import params_select_kpin

# Active parameter set; un-comment the second line to use the alternative set.
params = params_all_orchards
# params = params_select_kpin

# Record the active parameter set on the current MLflow run.
mlflow.log_params(params)

# Display the parameters as the cell output.
params

In [0]:
# Build the row filter applied when the source table is loaded.
# A truthy params["KPIN"] restricts the data to the listed KPINs; otherwise
# every row is kept.
if params["KPIN"]:
  kpin_condition = col("KPIN").isin(params["KPIN"])
  mlflow.log_params({"KPIN": params["KPIN"]})
else:
  # A literal TRUE keeps every row (NULL KPINs included). It states directly
  # what the former `isNotNull() | isNull()` tautology evaluated to.
  kpin_condition = F.lit(True)

## Load data

In [0]:
# Schema the loaded table is compared against further down in this section.
from canes_expected_schema import canes_expected_schema

# Display the expected schema as the cell output.
canes_expected_schema

In [0]:
# Read the configured table, then keep (and cache) only the rows that pass
# the KPIN filter built above.
initial_sdf = spark.table(params["table_name"])
fruit_sdf = initial_sdf.filter(kpin_condition)
fruit_sdf = fruit_sdf.cache()

# Row count after filtering, logged as an MLflow metric.
# NOTE: despite the `_sdf` suffix this holds the count, not a DataFrame.
fruit_count_sdf = fruit_sdf.count()
mlflow.log_metric(key="fruit_count", value=fruit_count_sdf)

# Inspect the schema and a sample of rows; the count is the cell output.
fruit_sdf.printSchema()
fruit_sdf.show()
fruit_count_sdf

In [0]:
# Stop when the table's schema differs from the expected one. The message
# prints both schemas so the differing field can be found without re-running.
assert canes_expected_schema == initial_sdf.schema, (
  f"Schema mismatch for table {params['table_name']!r}:\n"
  f"expected: {canes_expected_schema}\n"
  f"actual:   {initial_sdf.schema}"
)

### Count of missing values

In [0]:
# One output row holding, for each column, the number of NULL cells:
# when() produces a value only where the cell is NULL, and count() tallies
# the non-NULL results.
null_count_columns = [
  count(when(col(column_name).isNull(), column_name)).alias(column_name)
  for column_name in fruit_sdf.columns
]
fruit_sdf.select(null_count_columns).show()

## Compute statistical distribution for dimensions

In [0]:
# Summary statistics (DataFrame.summary()) restricted to the dimension columns.
dimension_columns = [
  "grower",
  "KPIN",
  "maturity_area",
  "bay",
  "latitude",
  "longitude",
  "sq_meters_bay",
  "cane_number",
]
dimensions_summary_sdf = fruit_sdf.select(*dimension_columns).summary()

dimensions_summary_sdf.printSchema()
dimensions_summary_sdf.show()

In [0]:
# Collect the summary to pandas and store it as a CSV artifact for this run.
dimensions_summary_pdf = dimensions_summary_sdf.toPandas()
dimensions_summary_csv_path = os.path.join(local_tmp_artifact_dir_path, "dimensions_summary.csv")
dimensions_summary_pdf.to_csv(dimensions_summary_csv_path, index=False)

## Compute statistical distribution for metrics

In [0]:
# Summary statistics (DataFrame.summary()) restricted to the metric columns.
metric_columns = [
  "cane_length_cm",
  "cane_diameter_bottom_cm",
  "cane_diameter_middle_cm",
  "cane_diameter_top_cm",
  "king_fruit_number",
  "lateral_fruit_number",
  "initial_fruit_number",
]
metrics_summary_sdf = fruit_sdf.select(*metric_columns).summary()

metrics_summary_sdf.printSchema()
metrics_summary_sdf.show()

In [0]:
# Collect the summary to pandas and store it as a CSV artifact for this run.
metrics_summary_pdf = metrics_summary_sdf.toPandas()
metrics_summary_csv_path = os.path.join(local_tmp_artifact_dir_path, "metrics_summary.csv")
metrics_summary_pdf.to_csv(metrics_summary_csv_path, index=False)

## Analyze the distribution of initial fruit count on the canes

In [0]:
# Narrow the data to the single column analysed in this section.
initial_fruit_number_sdf = fruit_sdf.select(col("initial_fruit_number"))

# Inspect schema and sample rows; the row count is the cell output.
initial_fruit_number_sdf.printSchema()
initial_fruit_number_sdf.show()
initial_fruit_number_sdf.count()

In [0]:
# Summary statistics (DataFrame.summary()) for the initial fruit number column.
initial_fruit_number_summary_sdf = initial_fruit_number_sdf.summary()
initial_fruit_number_summary_sdf.show()

In [0]:
# Collect the summary to pandas and store it as a CSV artifact for this run.
initial_fruit_number_summary_pdf = initial_fruit_number_summary_sdf.toPandas()
initial_fruit_number_summary_csv_path = os.path.join(
  local_tmp_artifact_dir_path, "canes_initial_fruit_number_summary.csv"
)
initial_fruit_number_summary_pdf.to_csv(initial_fruit_number_summary_csv_path, index=False)

## The variance among bays in each maturity area in terms of the initial fruit number

In [0]:
# Sum the initial fruit number of all canes within each
# (grower, KPIN, maturity_area, bay) group.
bay_grouping_columns = ["grower", "KPIN", "maturity_area", "bay"]
total_initial_fruit_number_among_bay_sdf = (
  fruit_sdf
  .groupBy(*bay_grouping_columns)
  .agg(F.sum("initial_fruit_number").alias("total_initial_fruit_number"))
)

# Inspect schema and sample rows; the number of groups is the cell output.
total_initial_fruit_number_among_bay_sdf.printSchema()
total_initial_fruit_number_among_bay_sdf.show()
total_initial_fruit_number_among_bay_sdf.count()

In [0]:
# Collect the per-bay totals into a pandas DataFrame, print its column
# overview, and display it as the cell output.
total_initial_fruit_number_among_bay_pdf = total_initial_fruit_number_among_bay_sdf.toPandas()
total_initial_fruit_number_among_bay_pdf.info()
total_initial_fruit_number_among_bay_pdf

In [0]:
# Store the per-bay totals as a CSV artifact for this run.
total_number_of_fruit_by_bay_csv_path = os.path.join(
  local_tmp_artifact_dir_path, "total_number_of_fruit_by_bay.csv"
)
total_initial_fruit_number_among_bay_pdf.to_csv(total_number_of_fruit_by_bay_csv_path, index=False)

### Distribution of total initial fruit number across bay by kpin and maturity area

In [0]:
# Label each row with "<KPIN>_<maturity_area>" so every KPIN/maturity-area
# pair gets its own box on the y axis.
kpin_as_text = total_initial_fruit_number_among_bay_pdf["KPIN"].astype(str)
total_initial_fruit_number_among_bay_pdf["KPIN_Maturity_Area"] = (
  kpin_as_text + "_" + total_initial_fruit_number_among_bay_pdf["maturity_area"]
)

# Horizontal box plot of the per-bay totals, one box per label.
fig, ax = plt.subplots(figsize=(16, 22))
sns.boxplot(
  data=total_initial_fruit_number_among_bay_pdf,
  x="total_initial_fruit_number",
  y="KPIN_Maturity_Area",
  orient="h",
  ax=ax,
)
ax.set_title("Distribution of total_initial_fruit_number across bay by kpin and maturity area")
ax.grid(True)
plt.show()

# Save the figure next to the other artifacts of this run.
boxplot_path = local_tmp_artifact_dir_path + "boxplot_for_distribution_of_total_initial_fruit_number.png"
fig.savefig(boxplot_path)

## The variance among bays in each maturity area in terms of the median cane features

In [0]:
# Median of each cane measurement within every
# (grower, KPIN, maturity_area, bay) group.
median_cane_features_sdf = (
  fruit_sdf
  .groupBy("grower", "KPIN", "maturity_area", "bay")
  .agg(
    F.median("cane_length_cm").alias("median_cane_length_cm"),
    F.median("cane_diameter_bottom_cm").alias("median_cane_diameter_bottom_cm"),
    F.median("cane_diameter_middle_cm").alias("median_cane_diameter_middle_cm"),
    F.median("cane_diameter_top_cm").alias("median_cane_diameter_top_cm"),
  )
)

# Inspect schema and sample rows; the number of groups is the cell output.
median_cane_features_sdf.printSchema()
median_cane_features_sdf.show()
median_cane_features_sdf.count()

In [0]:
# Collect the per-bay medians to pandas and store them as a CSV artifact.
median_cane_features_pdf = median_cane_features_sdf.toPandas()
cane_features_median_csv_path = os.path.join(local_tmp_artifact_dir_path, "cane_features_median.csv")
median_cane_features_pdf.to_csv(cane_features_median_csv_path, index=False)

### Distribution of cane features across bay by kpin and maturity area

In [0]:
# Label each row with "<KPIN>_<maturity_area>" so every KPIN/maturity-area
# pair gets its own box on the y axis.
median_cane_features_pdf['KPIN_Maturity_Area'] = median_cane_features_pdf['KPIN'].astype(str) + '_' + median_cane_features_pdf['maturity_area']

# Pick the aggregated measurement columns by their "median_" prefix rather
# than by position, so the selection survives columns being added or reordered.
features = [
  column_name
  for column_name in median_cane_features_pdf.columns
  if column_name.startswith("median_")
]

# One horizontal box plot per median feature, each saved as its own PNG.
for feature in features:
  fig, ax = plt.subplots(figsize=(16, 22))
  sns.boxplot(x=feature, y="KPIN_Maturity_Area", data=median_cane_features_pdf, orient="h", ax=ax)
  ax.set_title(f"Distribution of {feature} across bay by kpin and maturity area")
  ax.grid(True)
  plt.show()
  fig.savefig(local_tmp_artifact_dir_path + f"boxplot_for_distribution_of_{feature}.png")

## Distribution of initial fruit number compared to cane middle diameter

In [0]:
# Keep only the two columns compared in this section.
distribution_of_initial_num_and_cane_middle_sdf = fruit_sdf.select(
  col("cane_diameter_middle_cm"),
  col("initial_fruit_number"),
)

# Inspect schema and sample rows; the row count is the cell output.
distribution_of_initial_num_and_cane_middle_sdf.printSchema()
distribution_of_initial_num_and_cane_middle_sdf.show()
distribution_of_initial_num_and_cane_middle_sdf.count()

In [0]:
distribution_of_initial_num_and_cane_middle_pdf = distribution_of_initial_num_and_cane_middle_sdf.toPandas()

fig, ax = plt.subplots(2, 1, figsize=(22, 18))

# Top panel: initial fruit number against the middle diameter, one point per row.
sns.scatterplot(x="cane_diameter_middle_cm", y="initial_fruit_number", data=distribution_of_initial_num_and_cane_middle_pdf, ax=ax[0])
ax[0].set_title(f"Distribution of initial fruit number compared to cane middle diameter")
ax[0].grid(True)

# Bottom panel: histogram with 0.05-wide bins between whole-number bounds.
# np.linspace puts the upper bound itself in the edge list. The former
# np.arange(lower, upper, 0.05) ended one bin short of the upper bound, so
# values in that last 0.05 were missing from the histogram.
bin_width = 0.05
lower_edge = math.floor(distribution_of_initial_num_and_cane_middle_pdf["cane_diameter_middle_cm"].min())
# Keep at least one unit of range so the edge list is valid even when every
# value equals the same whole number.
upper_edge = max(
  math.ceil(distribution_of_initial_num_and_cane_middle_pdf["cane_diameter_middle_cm"].max()),
  lower_edge + 1,
)
bins = np.linspace(lower_edge, upper_edge, round((upper_edge - lower_edge) / bin_width) + 1)

sns.histplot(distribution_of_initial_num_and_cane_middle_pdf["cane_diameter_middle_cm"], bins=bins, ax=ax[1])
ax[1].set_title("Distribution density for cane middle diameter")
ax[1].set_xlabel("Cane middle diameter")
ax[1].set_ylabel("Frequency")
# Log-scaled counts keep sparsely populated bins visible.
ax[1].set_yscale("log")

plt.tight_layout()
plt.show()

# NOTE(review): the file name contains the typo "comapred"; it is left as-is
# because renaming would change the artifact name recorded in MLflow.
fig.savefig(local_tmp_artifact_dir_path + f"distribution_of_initial_fruit_number_comapred_to_cane_middle_diameter.png")

## Distribution discrepancy between the cane diameter and positions

In [0]:
# Differences between adjacent measurement positions along the cane:
# bottom minus middle, and middle minus top.
bottom_minus_middle = col("cane_diameter_bottom_cm") - col("cane_diameter_middle_cm")
middle_minus_top = col("cane_diameter_middle_cm") - col("cane_diameter_top_cm")

fruit_cane_diameter_positions_diff_sdf = (
  fruit_sdf
  .withColumn("cane_diameter_bottom_middle_diff", bottom_minus_middle)
  .withColumn("cane_diameter_middle_top_diff", middle_minus_top)
)

In [0]:
# Inspect the schema and sample rows of the DataFrame with the two diff
# columns; the row count is the cell output.
fruit_cane_diameter_positions_diff_sdf.printSchema()
fruit_cane_diameter_positions_diff_sdf.show()
fruit_cane_diameter_positions_diff_sdf.count()

In [0]:
# Keep the grouping keys, the initial fruit number and the two diff columns.
positions_diff_columns = [
  "grower",
  "KPIN",
  "maturity_area",
  "bay",
  "initial_fruit_number",
  "cane_diameter_bottom_middle_diff",
  "cane_diameter_middle_top_diff",
]
cane_diameter_positions_diff_sdf = fruit_cane_diameter_positions_diff_sdf.select(*positions_diff_columns)

# Show sample rows; the row count is the cell output.
cane_diameter_positions_diff_sdf.show()
cane_diameter_positions_diff_sdf.count()

In [0]:
cane_diameter_positions_diff_pdf = cane_diameter_positions_diff_sdf.toPandas()

# Name the two diff columns explicitly instead of slicing the last two
# columns by position, so the loop does not depend on column order.
positions_diff = [
  "cane_diameter_bottom_middle_diff",
  "cane_diameter_middle_top_diff",
]

# Width of every histogram bin, in the same unit as the diff columns.
bin_width = 0.05

for position in positions_diff:
  fig, ax = plt.subplots(2, 1, figsize=(22, 18))

  # Top panel: initial fruit number against the diff, one point per row.
  sns.scatterplot(x=position, y="initial_fruit_number", data=cane_diameter_positions_diff_pdf, ax=ax[0])
  ax[0].set_title(f"Distribution discrepancy between initial fruit number and {position}")
  ax[0].grid(True)

  # Bottom panel: histogram with fixed-width bins between whole-number bounds.
  # np.linspace puts the upper bound itself in the edge list. The former
  # np.arange(lower, upper, 0.05) ended one bin short of the upper bound, so
  # values in that last 0.05 were missing from the histogram.
  lower_edge = math.floor(cane_diameter_positions_diff_pdf[position].min())
  # Keep at least one unit of range so the edge list is valid even when every
  # value equals the same whole number.
  upper_edge = max(
    math.ceil(cane_diameter_positions_diff_pdf[position].max()),
    lower_edge + 1,
  )
  bins = np.linspace(lower_edge, upper_edge, round((upper_edge - lower_edge) / bin_width) + 1)

  sns.histplot(cane_diameter_positions_diff_pdf[position], bins=bins, ax=ax[1])
  ax[1].set_title(f"Distribution density for discrepancy between initial fruit number and {position}")
  ax[1].set_ylabel("Frequency")
  # Log-scaled counts keep sparsely populated bins visible.
  ax[1].set_yscale("log")

  plt.show()
  fig.savefig(local_tmp_artifact_dir_path + f"distribution_discrepancy_of_{position}.png")

### Explore negative discrepancy values between cane diameter positions in terms of bays and maturity area

#### Negative discrepancy values between cane bottom and middle diameter in terms of bays and maturity area

In [0]:
# Rows where the bottom diameter is smaller than the middle diameter
# (negative bottom-minus-middle difference).
bottom_middle_columns = [
  "grower",
  "KPIN",
  "maturity_area",
  "bay",
  "initial_fruit_number",
  "cane_diameter_bottom_middle_diff",
]
negative_bottom_middle_cane_diameter_sdf = (
  cane_diameter_positions_diff_sdf
  .select(*bottom_middle_columns)
  .filter(col("cane_diameter_bottom_middle_diff") < 0)
)

# Show sample rows; the number of negative rows is the cell output.
negative_bottom_middle_cane_diameter_sdf.show()
negative_bottom_middle_cane_diameter_sdf.count()

In [0]:
# Number of negative bottom-minus-middle rows per (grower, KPIN, maturity_area),
# largest first.
# The sort names the renamed column: after withColumnRenamed the output no
# longer has a "count" column, so sorting by "count" depended on Spark
# resolving the old name through the rename.
negative_bottom_middle_cane_diameter_maturity_area_sdf = (
  negative_bottom_middle_cane_diameter_sdf
  .groupBy("grower", "KPIN", "maturity_area")
  .count()
  .withColumnRenamed("count", "count_negative_values")
  .orderBy("count_negative_values", ascending=False)
)

# Show sample rows; the number of groups is the cell output.
negative_bottom_middle_cane_diameter_maturity_area_sdf.show()
negative_bottom_middle_cane_diameter_maturity_area_sdf.count()

In [0]:
# Number of negative bottom-minus-middle rows per
# (grower, KPIN, maturity_area, bay), largest first.
# The sort names the renamed column: after withColumnRenamed the output no
# longer has a "count" column, so sorting by "count" depended on Spark
# resolving the old name through the rename.
negative_bottom_middle_cane_diameter_maturity_area_bay_sdf = (
  negative_bottom_middle_cane_diameter_sdf
  .groupBy("grower", "KPIN", "maturity_area", "bay")
  .count()
  .withColumnRenamed("count", "count_negative_values")
  .orderBy("count_negative_values", ascending=False)
)

# Show sample rows; the number of groups is the cell output.
negative_bottom_middle_cane_diameter_maturity_area_bay_sdf.show()
negative_bottom_middle_cane_diameter_maturity_area_bay_sdf.count()

#### Negative discrepancy values between cane middle and top diameter in terms of bays and maturity area

In [0]:
# Rows where the middle diameter is smaller than the top diameter
# (negative middle-minus-top difference).
middle_top_columns = [
  "grower",
  "KPIN",
  "maturity_area",
  "bay",
  "initial_fruit_number",
  "cane_diameter_middle_top_diff",
]
negative_middle_top_cane_diameter_sdf = (
  cane_diameter_positions_diff_sdf
  .select(*middle_top_columns)
  .filter(col("cane_diameter_middle_top_diff") < 0)
)

# Show sample rows; the number of negative rows is the cell output.
negative_middle_top_cane_diameter_sdf.show()
negative_middle_top_cane_diameter_sdf.count()

In [0]:
# Number of negative middle-minus-top rows per (grower, KPIN, maturity_area),
# largest first.
# The sort names the renamed column: after withColumnRenamed the output no
# longer has a "count" column, so sorting by "count" depended on Spark
# resolving the old name through the rename.
negative_middle_top_cane_diameter_maturity_area_sdf = (
  negative_middle_top_cane_diameter_sdf
  .groupBy("grower", "KPIN", "maturity_area")
  .count()
  .withColumnRenamed("count", "count_negative_values")
  .orderBy("count_negative_values", ascending=False)
)

# Show sample rows; the number of groups is the cell output.
negative_middle_top_cane_diameter_maturity_area_sdf.show()
negative_middle_top_cane_diameter_maturity_area_sdf.count()

In [0]:
# Number of negative middle-minus-top rows per
# (grower, KPIN, maturity_area, bay), largest first.
# The sort names the renamed column: after withColumnRenamed the output no
# longer has a "count" column, so sorting by "count" depended on Spark
# resolving the old name through the rename.
negative_middle_top_cane_diameter_maturity_area_bay_sdf = (
  negative_middle_top_cane_diameter_sdf
  .groupBy("grower", "KPIN", "maturity_area", "bay")
  .count()
  .withColumnRenamed("count", "count_negative_values")
  .orderBy("count_negative_values", ascending=False)
)

# Show sample rows; the number of groups is the cell output.
negative_middle_top_cane_diameter_maturity_area_bay_sdf.show()
negative_middle_top_cane_diameter_maturity_area_bay_sdf.count()

## Wrap up

In [0]:
# Upload every file written to the per-run scratch directory (CSVs and PNGs)
# as artifacts of the active MLflow run.
mlflow.log_artifacts(local_tmp_artifact_dir_path)

In [0]:
# Remove tmp directory file
# The path ends with "/artifacts/", so the first dirname() only drops the
# trailing slash and the second drops "artifacts": tmp_dir is the directory
# named after this run's id, which is deleted recursively.
tmp_dir = os.path.dirname(os.path.dirname(local_tmp_artifact_dir_path))
# The "file:" prefix addresses the path as a local file for dbutils.fs.
dbutils.fs.rm("file:" + tmp_dir, recurse=True)

In [0]:
# End MLflow run
# Closes the run opened with mlflow.start_run() in the preamble.
mlflow.end_run()