# Analyze fruit analysis data set

## Method
- Load fruit analysis data
- Analyze the distribution for each field
- Compute statistics for relevant fruit analysis
- Plot charts for relevant statistics

## Preamble

In [0]:
import os
import pandas as pd
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, when, lit

In [0]:
# MLflow experiment under which the runs of this notebook are recorded.
experiment_name = (
    "/Lab_Project_FruitDropZGS/mlflow_experiment/"
    "analyze_fruit_analysis_data_set_experiment"
)
mlflow.set_experiment(experiment_name=experiment_name)

In [0]:
# Start the MLflow run; it stays active until the last cell calls end_run().
run_mlflow = mlflow.start_run()

# Per-run staging directory (keyed by the run id) where CSV files and figures
# are written before being logged as MLflow artifacts. The trailing slash is
# required: later cells build file paths by plain string concatenation.
local_tmp_artifact_dir_path = f"/Workspace/Lab_Project_FruitDropZGS/notebook_artifacts/tmp/{run_mlflow.info.run_id}/artifacts/"

# exist_ok=True makes the creation idempotent and removes the
# check-then-create race of testing os.path.exists() first.
os.makedirs(local_tmp_artifact_dir_path, exist_ok=True)

local_tmp_artifact_dir_path

## Configure parameters

In [0]:
from params_selected_grower_number import params_selected_grower_number
from params_all_orchards import params_all_orchards

# Choose the parameter set for this run. Swap the two assignments below to
# analyze only the selected grower numbers instead of all orchards.
# params = params_selected_grower_number
params = params_all_orchards

# Record the chosen parameters on the active MLflow run.
mlflow.log_params(params=params)

params

In [0]:
# Build the row filter applied to the source table.
# A non-empty "select_grower_number" restricts the data to those growers;
# an empty/falsy value means "keep every row".
selected_grower_numbers = params["select_grower_number"]
if selected_grower_numbers:
    filter_grower_number = col("Grower Number").isin(selected_grower_numbers)
else:
    # A literal TRUE keeps all rows (including null grower numbers) and states
    # the intent directly, instead of the tautology isNotNull() | isNull().
    filter_grower_number = lit(True)

filter_conditions = filter_grower_number

filter_conditions

## Load data

In [0]:
# Schema the source table is expected to have; a later cell compares it with
# the schema of the table that was actually loaded.
from expected_schema import expected_schema

# Display the expected schema for reference.
expected_schema

In [0]:
# Read the table named in the run parameters and keep only the rows that
# satisfy the grower filter built above.
initial_sdf = spark.table(params["table_name"])
fruit_sdf = initial_sdf.filter(filter_conditions)

# Log the number of rows remaining after filtering as an MLflow metric.
fruit_sdf_count = fruit_sdf.count()
mlflow.log_metric(key="fruit_sdf_count", value=fruit_sdf_count)

# Inspect the filtered data: schema, a sample of rows, and the row count.
fruit_sdf.printSchema()
fruit_sdf.show()
fruit_sdf_count

In [0]:
# Fail fast when the loaded table's schema differs from the expected one.
# An explicit raise (instead of a bare `assert`) is not stripped when Python
# runs with -O, and the message shows both schemas to make the drift visible.
# AssertionError is kept so the failure type is the same as before.
if not (expected_schema == initial_sdf.schema):
    raise AssertionError(
        "Loaded table schema does not match expected_schema.\n"
        f"expected: {expected_schema}\n"
        f"actual:   {initial_sdf.schema}"
    )

## Distribution of collect date by grower number and maturity area

In [0]:
# Number of records for every (grower number, maturity area, collect date)
# combination present in the filtered data.
fruit_grower_maturity_area_collect_date_sdf = (
    fruit_sdf
    .groupBy(["Grower Number", "Maturity Area Name", "Collect Date"])
    .count()
)

# Show a sample of the groups, then the number of distinct groups.
fruit_grower_maturity_area_collect_date_sdf.show()
fruit_grower_maturity_area_collect_date_sdf.count()

In [0]:
# Summary statistics of the per-group record counts computed above.
group_record_counts_sdf = fruit_grower_maturity_area_collect_date_sdf.select("count")
fruit_grower_maturity_area_collect_date_summary_sdf = group_record_counts_sdf.summary()
fruit_grower_maturity_area_collect_date_summary_sdf.show()

In [0]:
# Bring the summary to pandas and stage it as a CSV file in the run's
# artifact directory (the directory path already ends with "/").
fruit_grower_maturity_area_collect_date_summary_pdf = (
    fruit_grower_maturity_area_collect_date_summary_sdf.toPandas()
)
collect_date_summary_csv_path = (
    f"{local_tmp_artifact_dir_path}fruit_grower_maturity_area_collect_date_summary.csv"
)
fruit_grower_maturity_area_collect_date_summary_pdf.to_csv(
    collect_date_summary_csv_path,
    index=False,
)

### Count of distinct collect date by grower number and maturity area

In [0]:
# The input has one row per (grower, maturity area, collect date), so counting
# rows per (grower, maturity area) gives the number of distinct collect dates.
# Groups with the most collect dates come first.
fruit_distinct_collect_date_count_sdf = (
    fruit_grower_maturity_area_collect_date_sdf
    .groupBy(["Grower Number", "Maturity Area Name"])
    .count()
    .withColumnRenamed("count", "count_distinct_collect_date")
    .orderBy("count_distinct_collect_date", ascending=False)
)

fruit_distinct_collect_date_count_pdf = fruit_distinct_collect_date_count_sdf.toPandas()

fruit_distinct_collect_date_count_pdf

In [0]:
# Stage the distinct-collect-date counts as a CSV artifact.
distinct_collect_date_csv_path = (
    f"{local_tmp_artifact_dir_path}fruit_distinct_collect_date_count.csv"
)
fruit_distinct_collect_date_count_pdf.to_csv(distinct_collect_date_csv_path, index=False)

## Count records by grower number, orchard and special instructions

In [0]:
# Record count for every (grower number, orchard, special instructions)
# combination, listed in grower-number order.
fruit_grower_number_orchard_specialIns_sdf = (
    fruit_sdf
    .groupBy("Grower Number", "Orchard Name", "specialInstructions")
    .count()
    .withColumnRenamed("count", "count_records")
    .orderBy("Grower Number")
)

# Show a sample of the groups, then the number of distinct groups.
fruit_grower_number_orchard_specialIns_sdf.show()
fruit_grower_number_orchard_specialIns_sdf.count()

## Compute statistical distribution for relevant fruit analysis

In [0]:
# Keep only the fruit measurement columns that the statistics below are
# computed on.
fruit_relevant_analysis_sdf = fruit_sdf.select(
    ["Dry Matter", "Brix Equatorial", "Fresh Weight", "Hue", "Pressure"]
)

# Inspect the selection: schema, a sample of rows, and the row count.
fruit_relevant_analysis_sdf.printSchema()
fruit_relevant_analysis_sdf.show()
fruit_relevant_analysis_sdf.count()

### Result: statistical distribution for relevant fruit analysis

In [0]:
# Summary statistics for every selected measurement column, printed for
# inspection.
fruit_relevant_analysis_summary_sdf = fruit_relevant_analysis_sdf.summary()
fruit_relevant_analysis_summary_sdf.show()

In [0]:
# Bring the measurement summary to pandas and stage it as a CSV artifact.
fruit_relevant_analysis_summary_pdf = fruit_relevant_analysis_summary_sdf.toPandas()
relevant_analysis_summary_csv_path = (
    f"{local_tmp_artifact_dir_path}fruit_relevant_analysis_statistics_summary.csv"
)
fruit_relevant_analysis_summary_pdf.to_csv(
    relevant_analysis_summary_csv_path,
    index=False,
)

## Analysis of relevant fruit analysis by grower number and sample time

### Split special instructions into sample time and positions

In [0]:
# Derive two columns from the free-text "specialInstructions" value:
#   "Sample Time" - decided by the prefix of the text,
#   "Positions"   - decided by the suffix of the text.
# Rows matching none of the patterns get NULL in the derived column.
special_instructions_col = col("specialInstructions")

sample_time_col = (
    when(special_instructions_col.startswith("ripening 1"), "ripening 1")
    .when(special_instructions_col.startswith("ripening 2"), "ripening 2")
    # The three harvest variants all map to the single "Harvest" sample time.
    .when(special_instructions_col.startswith("HarvestTop"), "Harvest")
    .when(special_instructions_col.startswith("HarvestMiddle"), "Harvest")
    .when(special_instructions_col.startswith("HarvestBottom"), "Harvest")
    .when(special_instructions_col.startswith("Post harvest"), "Post harvest")
    .otherwise(lit(None))
)

positions_col = (
    when(special_instructions_col.endswith("Bottom"), "Bottom")
    .when(special_instructions_col.endswith("Middle"), "Middle")
    .when(special_instructions_col.endswith("Top"), "Top")
    .when(special_instructions_col.endswith("harvest"), "Harvest")
    .otherwise(lit(None))
)

fruit_split_sdf = (
    fruit_sdf
    .withColumn("Sample Time", sample_time_col)
    .withColumn("Positions", positions_col)
)

# Inspect the result: schema, a sample of rows, and the row count.
fruit_split_sdf.printSchema()
fruit_split_sdf.show()
fruit_split_sdf.count()

## Distribution of collect date across sample time 

In [0]:
# For each sample time: earliest collect date, latest collect date, and the
# difference between the two. The difference column is left unaliased, so it
# keeps the name Spark generates for the expression.
earliest_collect_date = F.min("Collect Date")
latest_collect_date = F.max("Collect Date")

fruit_split_collect_date_across_sample_time_sdf = (
    fruit_split_sdf
    .groupBy("Sample Time")
    .agg(
        earliest_collect_date.alias("min_collect_date"),
        latest_collect_date.alias("max_collect_date"),
        latest_collect_date - earliest_collect_date,
    )
    .orderBy("min_collect_date")
)

# Inspect the result: schema, the rows, and the number of sample times.
fruit_split_collect_date_across_sample_time_sdf.printSchema()
fruit_split_collect_date_across_sample_time_sdf.show()
fruit_split_collect_date_across_sample_time_sdf.count()

In [0]:
# Bring the per-sample-time date ranges to pandas and stage them as a CSV
# artifact.
fruit_split_collect_date_across_sample_time_pdf = (
    fruit_split_collect_date_across_sample_time_sdf.toPandas()
)
collect_date_across_sample_time_csv_path = (
    f"{local_tmp_artifact_dir_path}fruit_collect_date_across_sample_time.csv"
)
fruit_split_collect_date_across_sample_time_pdf.to_csv(
    collect_date_across_sample_time_csv_path,
    index=False,
)

### Compare relevant fruit analysis by grower number and sample time

In [0]:
# Grower number and sample time, followed by the measurement columns that are
# compared across them.
fruit_relevant_analysis_grower_num_sample_time_sdf = fruit_split_sdf.select(
    [
        "Grower Number",
        "Sample Time",
        "Dry Matter",
        "Brix Equatorial",
        "Fresh Weight",
        "Hue",
        "Pressure",
    ]
)

# Inspect the selection: schema, a sample of rows, and the row count.
fruit_relevant_analysis_grower_num_sample_time_sdf.printSchema()
fruit_relevant_analysis_grower_num_sample_time_sdf.show()
fruit_relevant_analysis_grower_num_sample_time_sdf.count()

In [0]:
# Collect the selection into a pandas DataFrame for plotting.
fruit_relevant_analysis_grower_num_sample_time_pdf = fruit_relevant_analysis_grower_num_sample_time_sdf.toPandas()
# Print the column dtypes and non-null counts, then display the frame.
fruit_relevant_analysis_grower_num_sample_time_pdf.info()
fruit_relevant_analysis_grower_num_sample_time_pdf

#### Result: compare relevant fruit analysis by grower number and sample time

In [0]:
# One horizontal boxplot per float64 measurement column: the measurement on
# the x axis, one row per grower number, coloured by sample time. Each figure
# is saved as a PNG in the run's artifact directory.

# Use the grower number as a categorical (string) label on the y axis.
fruit_relevant_analysis_grower_num_sample_time_pdf["Grower Number"] = fruit_relevant_analysis_grower_num_sample_time_pdf["Grower Number"].astype(str)

# Select the measurement columns only AFTER the cast above, so that a
# float-typed "Grower Number" can never be picked up as a measurement to plot.
relevant_analysis = fruit_relevant_analysis_grower_num_sample_time_pdf.select_dtypes(include=["float64"])

fruit_relevant_analysis_grower_num_sample_time_pdf = fruit_relevant_analysis_grower_num_sample_time_pdf.sort_values(by="Grower Number", ascending=False)

for analysis in relevant_analysis.columns:
    fig, ax = plt.subplots(figsize=(16, 22))
    sns.boxplot(
        x=analysis,
        y="Grower Number",
        hue="Sample Time",
        data=fruit_relevant_analysis_grower_num_sample_time_pdf,
        orient="h",
        ax=ax,
    )
    ax.set_title(f"Fruit {analysis} by Grower Number")
    ax.grid()
    # Save before showing so the file never depends on what the backend does
    # to the figure in show(); then close it so figures created in this loop
    # do not accumulate in pyplot's figure registry.
    fig.savefig(local_tmp_artifact_dir_path + f"analysis_{analysis}_by_grower_num_sampletime.png")
    plt.show()
    plt.close(fig)

## Boxplot analysis of the distribution of fruit analysis across sample time by grower number and maturity area

In [0]:
# Grower number, maturity area and sample time, followed by the measurement
# columns whose distributions are plotted below.
fruit_relevant_analysis_grower_maturity_area_sample_time_sdf = fruit_split_sdf.select(
    [
        "Grower Number",
        "Maturity Area Name",
        "Sample Time",
        "Dry Matter",
        "Brix Equatorial",
        "Fresh Weight",
        "Hue",
        "Pressure",
    ]
)

# Inspect the selection: schema, a sample of rows, and the row count.
fruit_relevant_analysis_grower_maturity_area_sample_time_sdf.printSchema()
fruit_relevant_analysis_grower_maturity_area_sample_time_sdf.show()
fruit_relevant_analysis_grower_maturity_area_sample_time_sdf.count()

### Result: boxplot analysis of the distribution of fruit analysis across sample time by grower number and maturity area

In [0]:
# One horizontal boxplot per float64 measurement column: the measurement on
# the x axis, one row per "<grower number>_<maturity area>" group, coloured by
# sample time. Each figure is saved as a PNG in the run's artifact directory.
fruit_relevant_analysis_grower_maturity_area_sample_time_pdf = fruit_relevant_analysis_grower_maturity_area_sample_time_sdf.toPandas()

# Combined categorical label used for the y axis.
fruit_relevant_analysis_grower_maturity_area_sample_time_pdf["Grower_Number_Maturity_Area"] = (
    fruit_relevant_analysis_grower_maturity_area_sample_time_pdf["Grower Number"].astype(str)
    + "_"
    + fruit_relevant_analysis_grower_maturity_area_sample_time_pdf["Maturity Area Name"]
)

fruit_relevant_analysis_grower_maturity_area_sample_time_pdf = fruit_relevant_analysis_grower_maturity_area_sample_time_pdf.sort_values(by="Grower_Number_Maturity_Area", ascending=False)

relevant_analysis = fruit_relevant_analysis_grower_maturity_area_sample_time_pdf.select_dtypes(include=["float64"])

for analysis in relevant_analysis.columns:
    fig, ax = plt.subplots(figsize=(16, 22))
    sns.boxplot(
        x=analysis,
        y="Grower_Number_Maturity_Area",
        hue="Sample Time",
        data=fruit_relevant_analysis_grower_maturity_area_sample_time_pdf,
        orient="h",
        ax=ax,
    )
    ax.set_title(f"Fruit {analysis} by Grower Number and Maturity Area")
    ax.grid()
    # Save before showing so the file never depends on what the backend does
    # to the figure in show(); then close it so figures created in this loop
    # do not accumulate in pyplot's figure registry.
    # NOTE(review): "simple_time" in the file name looks like a typo for
    # "sample_time"; kept unchanged so existing artifact names stay stable.
    fig.savefig(local_tmp_artifact_dir_path + f"analysis_{analysis}_across_simple_time_by_grower_maturity_area.png")
    plt.show()
    plt.close(fig)

## Wrap up

In [0]:
# Upload everything staged in the run's local artifact directory (CSV files
# and figures) to the active MLflow run.
mlflow.log_artifacts(local_dir=local_tmp_artifact_dir_path)

In [0]:
# Remove this run's local staging directory.
# The artifact path ends with "/", so the first dirname only strips that
# slash (".../<run_id>/artifacts") and the second one yields ".../<run_id>".
artifacts_dir = os.path.dirname(local_tmp_artifact_dir_path)
tmp_dir = os.path.dirname(artifacts_dir)
dbutils.fs.rm(f"file:{tmp_dir}", recurse=True)

In [0]:
# End the active MLflow run that was started near the top of this notebook
# with mlflow.start_run().
mlflow.end_run()