# Analyze relationship between weather stress and fruit drop

## Objective

Analyze the impact of weather stress on fruit drop.

## Method

![cross_reference_data_sets.drawio.svg](../diagrams/cross_reference_data_sets.drawio.svg)

## Preamble

In [0]:
import os
import sys
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression 
from pyspark.sql.functions import min, lit, col, lag
from pyspark.sql.functions import month, year, when, to_date
sys.path.append(os.path.abspath('..'))

In [0]:
experiment_name = "/Lab_Project_FruitDropZGS/mlflow_experiment/analyze_weather_and_fruit_drop_data_set_experiment"
mlflow.set_experiment(experiment_name)

In [0]:
# Start the MLflow run that every log_metric / log_params / log_artifacts call below attaches to.
run_mlflow = mlflow.start_run()

# Per-run local scratch directory for CSVs and figures. The trailing "/" is required:
# later cells build file paths by plain string concatenation onto this value.
local_tmp_artifact_dir_path = "/Workspace/Lab_Project_FruitDropZGS/notebook_artifacts/tmp/" + run_mlflow.info.run_id + "/artifacts/"

# exist_ok=True replaces the separate exists() check, avoiding the check-then-create race.
os.makedirs(local_tmp_artifact_dir_path, exist_ok=True)

local_tmp_artifact_dir_path

## Configure experiment

Specify the parameters (data sets, conditions, and other variables) that are used in the experiment.

In [0]:
# NOTE(review): the wildcard import hides where `famous_differentiation` is defined; presumably the
# module declares one parameter dict per experiment configuration — confirm, and consider importing
# the chosen configuration by name.
from cane_level_fruit_drop_and_maturity_area_weather_data_sets import *

# Parameter set for this run; later cells read table names, the filtering flag and thresholds from it.
params = famous_differentiation

# Record the parameters against the active MLflow run.
mlflow.log_params(params)

params

## Load data

### Load joint maturity area data set schema 

In [0]:
from maturity_area_data.joint_maturity_area_expected_schema import joint_maturity_area_expected_schema

joint_maturity_area_expected_schema

### Load joint maturity area data set 

In [0]:
# Read the joint maturity area table named in the experiment parameters.
joint_maturity_area_sdf = spark.table(params["joined_maturity_area_data_set"])

# NOTE: despite the `_sdf` suffix this holds the integer row count, which is logged as an MLflow metric.
joint_maturity_area_count_sdf = joint_maturity_area_sdf.count()
mlflow.log_metric("joint_maturity_area_count", joint_maturity_area_count_sdf)

# Notebook output: schema, a sample of rows, and the row count.
joint_maturity_area_sdf.printSchema()
joint_maturity_area_sdf.show()
joint_maturity_area_count_sdf

In [0]:
assert joint_maturity_area_expected_schema == joint_maturity_area_sdf.schema

### Load joint weather data set schema

In [0]:
from weather_analysis_data.merged_weather_data_expected_schema import merged_weather_data_expected_schema

merged_weather_data_expected_schema

### Load joint weather data set

In [0]:
# Read the merged weather table named in the experiment parameters.
merged_weather_data_sdf = spark.table(params["merged_weather_data_set"])

# NOTE: despite the `_sdf` suffix this holds the integer row count, which is logged as an MLflow metric.
merged_weather_data_count_sdf = merged_weather_data_sdf.count()
mlflow.log_metric("merged_weather_data_count", merged_weather_data_count_sdf)

# Notebook output: schema, a sample of rows, and the row count.
merged_weather_data_sdf.printSchema()
merged_weather_data_sdf.show()
merged_weather_data_count_sdf

In [0]:
assert merged_weather_data_expected_schema == merged_weather_data_sdf.schema

### Load fruit drop survey data set schema

In [0]:
from fruit_drop_data.fruit_drop_expected_schema import fruit_drop_expected_schema

fruit_drop_expected_schema

### Load fruit drop survey data set 

In [0]:
# Read the fruit drop survey table named in the experiment parameters.
fruit_drop_data_sdf = spark.table(params["fruit_drop_data_set"])

# NOTE: despite the `_sdf` suffix this holds the integer row count, which is logged as an MLflow metric.
fruit_drop_data_count_sdf = fruit_drop_data_sdf.count()
mlflow.log_metric("fruit_drop_data_count", fruit_drop_data_count_sdf)

# Notebook output: schema, a sample of rows, and the row count.
fruit_drop_data_sdf.printSchema()
fruit_drop_data_sdf.show()
fruit_drop_data_count_sdf

In [0]:
assert fruit_drop_expected_schema == fruit_drop_data_sdf.schema

### Load cane data set schema

In [0]:
from canes_features_data.canes_expected_schema import canes_expected_schema

canes_expected_schema

### Load cane data set

In [0]:
# Read the canes table named in the experiment parameters.
canes_data_sdf = spark.table(params["canes_data_set"])

# NOTE: despite the `_sdf` suffix this holds the integer row count, which is logged as an MLflow metric.
canes_data_count_sdf = canes_data_sdf.count()
mlflow.log_metric("canes_data_count", canes_data_count_sdf)

# Notebook output: schema, a sample of rows, and the row count.
canes_data_sdf.printSchema()
canes_data_sdf.show()
canes_data_count_sdf

In [0]:
assert canes_expected_schema == canes_data_sdf.schema

## [Optional] Filter complete weather time series

### Filter weather data for weather stations relevant to the maturity areas

In [0]:
# Weather stations that are referenced by at least one maturity area record.
distinct_stations_sdf = joint_maturity_area_sdf.select("WeatherStation").distinct()

# Inner join keeps only the weather rows of those stations; the helper key column is dropped afterwards.
station_match = distinct_stations_sdf.WeatherStation == merged_weather_data_sdf.station
selected_merge_weather_station_sdf = merged_weather_data_sdf.join(
  distinct_stations_sdf,
  on=station_match,
  how="inner",
).drop("WeatherStation")

selected_merge_weather_station_sdf.printSchema()
selected_merge_weather_station_sdf.show()
selected_merge_weather_station_sdf.count()

In [0]:
selected_merge_weather_station_summary_sdf = selected_merge_weather_station_sdf.summary()
selected_merge_weather_station_summary_sdf.show()

In [0]:
selected_merge_weather_station_summary_pdf = selected_merge_weather_station_summary_sdf.toPandas()
selected_merge_weather_station_summary_pdf.to_csv(local_tmp_artifact_dir_path + 'selected_merge_weather_station_summary.csv', index=False)

### Filter complete time series of joint maturity area and weather data set 

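If the `filter_weather_time_series` configuration parameter is true, keep only the records of the station-filtered weather data that belong to a complete time series.

Otherwise, use all records in the merged weather data set (this includes stations that are not linked to any maturity area).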

In [0]:
from weather_analysis_data.filter_complete_time_series_weather_data import filterCompleteWeatherTimeSeries

# Choose the hourly weather input used by the weather stress and VPD computations further down.
if params["filter_weather_time_series"]:
  # Result of the project helper applied to the station-filtered weather data, cached in Spark.
  # NOTE(review): presumably this holds the `datetime` values that pass the completeness check — confirm in the helper.
  complete_timeseries_sdf = filterCompleteWeatherTimeSeries(selected_merge_weather_station_sdf).cache()

  # Inner join on `datetime`: keeps the station-filtered weather rows whose timestamp is in the helper's result.
  complete_hourly_timeseries_weather_data_sdf = complete_timeseries_sdf.join(selected_merge_weather_station_sdf, "datetime")
else:
  # NOTE(review): this branch uses the full merged data set, not the station-filtered one built above.
  complete_hourly_timeseries_weather_data_sdf = merged_weather_data_sdf

In [0]:
complete_hourly_time_series_weather_data_count_sdf = complete_hourly_timeseries_weather_data_sdf.count()
mlflow.log_metric("complete_hourly_time_series_weather_data_count", complete_hourly_time_series_weather_data_count_sdf)

complete_hourly_timeseries_weather_data_sdf.printSchema()
complete_hourly_timeseries_weather_data_sdf.show()
complete_hourly_time_series_weather_data_count_sdf

## Aggregate fruit drop metrics from multiple cane positions

Given that we analyze the relationship between weather stress and fruit drop by cane, we:
1. aggregate the fruit drop survey metrics across the whole cane; and
2. sum all dropped fruit counts

### Aggregate fruit drop metrics by cane

In [0]:
# Grouping keys: one output row per cane (grower, KPIN, maturity area, bay, cane number) and assessment date.
cane_survey_keys = ["Grower", "KPIN", "Maturity Area", "Bay", "Cane number", "assessment date"]

# Every survey count is summed over all rows of the group and exposed under an "_across_positions" name.
summed_survey_counts = [
  F.sum("shrinked fruit still attached on the cane").alias("shrinked_fruit_on_cane_across_positions"),
  F.sum("not dropped fruit with dry peduncle").alias("not_dropped_dry_peduncle_across_positions"),
  F.sum("dropped fruit with dry peduncle").alias("dropped_dry_peduncle_across_positions"),
  F.sum("dropped fruit with healthy peduncle").alias("dropped_healthy_peduncle_across_positions"),
]

aggregate_fruit_drop_metric_by_cane_sdf = fruit_drop_data_sdf.groupBy(*cane_survey_keys).agg(*summed_survey_counts)

# Row count of the aggregated data set, logged as an MLflow metric.
aggregate_fruit_drop_metric_by_cane_count_sdf = aggregate_fruit_drop_metric_by_cane_sdf.count()
mlflow.log_metric("aggregate_fruit_drop_metric_by_cane_count", aggregate_fruit_drop_metric_by_cane_count_sdf)

aggregate_fruit_drop_metric_by_cane_sdf.printSchema()
aggregate_fruit_drop_metric_by_cane_sdf.show()
aggregate_fruit_drop_metric_by_cane_count_sdf

### Sum dropped fruit with dry and healthy peduncles

In [0]:
# Total dropped fruit per cane and assessment date: dry-peduncle drops plus healthy-peduncle drops.
# Only the identifying columns and the new total are kept; the result is sorted so that show()
# lists the rows with the most dropped fruit first.
sum_dropped_dry_and_healthy_fruit_sdf = aggregate_fruit_drop_metric_by_cane_sdf.withColumn(
  "total_dropped_dry_and_healthy_peduncle", col("dropped_dry_peduncle_across_positions") + col("dropped_healthy_peduncle_across_positions")
).select(
  "Grower", 
  "KPIN", 
  "Maturity Area", 
  "Bay", 
  "Cane number", 
  "assessment date",
  "total_dropped_dry_and_healthy_peduncle"
).orderBy("total_dropped_dry_and_healthy_peduncle", ascending=False)

sum_dropped_dry_and_healthy_fruit_sdf.printSchema()
sum_dropped_dry_and_healthy_fruit_sdf.show()
sum_dropped_dry_and_healthy_fruit_sdf.count()

In [0]:
sum_dropped_dry_and_healthy_fruit_summary_sdf = sum_dropped_dry_and_healthy_fruit_sdf.summary()
sum_dropped_dry_and_healthy_fruit_summary_sdf.show()

## Cross-reference total dropped fruit and canes data set by grower, KPIN, maturity area, bay, and cane number

In [0]:
# Attach the per-cane dropped fruit totals to the cane records.
# Inner join on grower, KPIN, maturity area, bay and cane number: canes without a fruit drop
# record (and fruit drop records without a cane) are discarded.
cane_level_fruit_drop_with_cane_details_sdf = canes_data_sdf.join(
  sum_dropped_dry_and_healthy_fruit_sdf,
  (sum_dropped_dry_and_healthy_fruit_sdf["Grower"] == canes_data_sdf["grower"]) &
  (sum_dropped_dry_and_healthy_fruit_sdf["KPIN"] == canes_data_sdf["KPIN"]) &
  (sum_dropped_dry_and_healthy_fruit_sdf["Maturity Area"] == canes_data_sdf["maturity_area"]) &
  (sum_dropped_dry_and_healthy_fruit_sdf["Bay"] == canes_data_sdf["bay"]) & 
  (sum_dropped_dry_and_healthy_fruit_sdf["Cane number"] == canes_data_sdf["cane_number"]),
  "inner"
# The two sides carry differently spelled date columns; rename both so their origin is explicit.
# NOTE(review): "assessment_date" (underscore) is presumably the canes column — confirm against canes_expected_schema.
).withColumnRenamed(
  "assessment_date", "canes_assessment_date"
).withColumnRenamed(
  "assessment date", "fruit_drop_assessment_date"
# Drop the fruit-drop side's copies of the join keys by column reference (not by name),
# so the canes-side columns remain.
).drop(
  sum_dropped_dry_and_healthy_fruit_sdf["KPIN"]
).drop(
  sum_dropped_dry_and_healthy_fruit_sdf["Grower"]
).drop(
  sum_dropped_dry_and_healthy_fruit_sdf["Bay"]
)

# The remaining fruit-drop key columns are dropped by name.
drop_columns = ["Maturity Area", "Cane number"]
cane_level_fruit_drop_with_cane_details_sdf = cane_level_fruit_drop_with_cane_details_sdf.drop(*drop_columns)

cane_level_fruit_drop_with_cane_details_sdf.printSchema()
cane_level_fruit_drop_with_cane_details_sdf.show()
cane_level_fruit_drop_with_cane_details_sdf.count()

## Cross-reference cane-level fruit drop data set with weather station from maturity area data

In [0]:
distinct_kpin_maturity_area_stations_sdf = joint_maturity_area_sdf.select("KPIN", "Maturity_Area", "WeatherStation").distinct()

distinct_kpin_maturity_area_stations_sdf.printSchema()
distinct_kpin_maturity_area_stations_sdf.show()
distinct_kpin_maturity_area_stations_sdf.count()

In [0]:
# Add the weather station of each cane's (KPIN, maturity area) pair.
# No join type is given, so this is an inner join: canes whose KPIN / maturity area has no
# station entry are discarded.
# NOTE(review): if one (KPIN, Maturity_Area) pair maps to several stations, each cane row is
# repeated once per station — confirm the mapping is one-to-one.
cane_level_fruit_drop_with_stations_sdf = cane_level_fruit_drop_with_cane_details_sdf.join(
  distinct_kpin_maturity_area_stations_sdf, 
  (cane_level_fruit_drop_with_cane_details_sdf["KPIN"] == distinct_kpin_maturity_area_stations_sdf["KPIN"]) &
  (cane_level_fruit_drop_with_cane_details_sdf["maturity_area"] == distinct_kpin_maturity_area_stations_sdf["Maturity_Area"])
# Drop the station side's copies of the join keys, leaving only WeatherStation as the new column.
).drop(
  distinct_kpin_maturity_area_stations_sdf["KPIN"]
).drop(
  distinct_kpin_maturity_area_stations_sdf["Maturity_Area"]
)

cane_level_fruit_drop_with_stations_sdf.printSchema()
cane_level_fruit_drop_with_stations_sdf.show()
cane_level_fruit_drop_with_stations_sdf.count()

## Compute fruit drop ratio and cumulative fruit drop ratio

### Load joint cane-level fruit drop schema

In [0]:
from joint_cane_level_fruit_drop_schema import joint_cane_level_fruit_drop_schema

joint_cane_level_fruit_drop_schema

In [0]:
from compute_joint_cane_level_fruit_drop_ratio import compute_joint_cane_level_fruit_drop_ratio

joint_cane_level_fruit_drop_sdf = compute_joint_cane_level_fruit_drop_ratio(cane_level_fruit_drop_with_stations_sdf)

joint_cane_level_fruit_drop_sdf.printSchema()
joint_cane_level_fruit_drop_sdf.show()
joint_cane_level_fruit_drop_sdf.count()

In [0]:
assert joint_cane_level_fruit_drop_schema == joint_cane_level_fruit_drop_sdf.schema

## Distribution of cumulative fruit drop ratio at last fruit drop survey

### Distribution of cumulative fruit drop ratio across bay by KPIN, maturity area and bay at last fruit drop survey

In [0]:
# Average cumulative fruit drop ratio per bay, taken at each cane's most recent survey.
#
# The previous version averaged `cumulative_fruit_drop_ratio` over every survey date of the bay,
# so values from earlier surveys were mixed into a figure that is named, headed and plotted as
# the "last survey" value. Each cane is now reduced to its latest survey row before averaging.
# Rows with a null `fruit_drop_assessment_date` never satisfy the equality filter and are excluded.
cane_survey_window = Window.partitionBy("KPIN", "maturity_area", "bay", "cane_number")

last_survey_per_cane_sdf = joint_cane_level_fruit_drop_sdf.withColumn(
  "cane_last_fruit_drop_assessment_date",
  F.max("fruit_drop_assessment_date").over(cane_survey_window)
).filter(
  F.col("fruit_drop_assessment_date") == F.col("cane_last_fruit_drop_assessment_date")
)

# Output columns are unchanged: KPIN, maturity_area, bay, latest survey date in the bay, and the average.
last_survey_avg_cumulative_fruit_drop_ratio_across_bay_sdf = last_survey_per_cane_sdf.groupBy(
  "KPIN",
  "maturity_area",
  "bay"
).agg(
  F.max("fruit_drop_assessment_date").alias("last_fruit_drop_assessment_date"),
  F.avg("cumulative_fruit_drop_ratio").alias("last_survey_avg_cumulative_fruit_drop_ratio_across_bay")
).orderBy("KPIN", "bay", ascending=True)

last_survey_avg_cumulative_fruit_drop_ratio_across_bay_sdf.printSchema()
last_survey_avg_cumulative_fruit_drop_ratio_across_bay_sdf.show()
last_survey_avg_cumulative_fruit_drop_ratio_across_bay_sdf.count()

#### Result: Distribution of cumulative fruit drop ratio across bay by KPIN and maturity area at last fruit drop survey

In [0]:
# Bring the per-bay averages to pandas for plotting.
last_survey_avg_cumulative_fruit_drop_ratio_across_bay_pdf = last_survey_avg_cumulative_fruit_drop_ratio_across_bay_sdf.toPandas()

# Category label for the y-axis: one box per KPIN / maturity area, spread over its bays.
last_survey_avg_cumulative_fruit_drop_ratio_across_bay_pdf["KPIN_maturity_area"] = last_survey_avg_cumulative_fruit_drop_ratio_across_bay_pdf["KPIN"].astype(str) + "_" + last_survey_avg_cumulative_fruit_drop_ratio_across_bay_pdf["maturity_area"].astype(str)

# Sort by the plotted value, highest first, before handing the frame to seaborn.
last_survey_avg_cumulative_fruit_drop_ratio_across_bay_pdf = last_survey_avg_cumulative_fruit_drop_ratio_across_bay_pdf.sort_values(by="last_survey_avg_cumulative_fruit_drop_ratio_across_bay", ascending=False)

fig, ax = plt.subplots(figsize=(16,22))

sns.boxplot(x="last_survey_avg_cumulative_fruit_drop_ratio_across_bay", y="KPIN_maturity_area", data=last_survey_avg_cumulative_fruit_drop_ratio_across_bay_pdf, orient="h", ax=ax)
ax.set_title("Distribution of average cumulative fruit drop ratio across bay by KPIN and maturity area at last fruit drop survey")
ax.grid(True)

# Save before showing, as the other chart cells do, so the file never depends on the figure's
# state after plt.show(). The string has no placeholders, so the f-prefix is dropped.
fig.savefig(local_tmp_artifact_dir_path + "distribution_last_survey_avg_cumulative_fruit_drop_ratio_across_bay_by_kpin_maturity_area.png")
plt.show()

### Distribution of cumulative fruit drop ratio across cane by KPIN, maturity area and bay at last fruit drop survey

In [0]:
# One row per cane: its latest survey date and its largest cumulative fruit drop ratio.
# NOTE(review): F.max of the ratio equals the value at the last survey only if the cumulative
# ratio never decreases over time for a cane — confirm against compute_joint_cane_level_fruit_drop_ratio.
last_survey_cumulative_fruit_drop_ratio_across_cane_sdf = joint_cane_level_fruit_drop_sdf.groupBy(
  "KPIN",
  "maturity_area",
  "bay",
  "cane_number",
).agg(
  F.max("fruit_drop_assessment_date").alias("last_fruit_drop_survey_assessment_date"),
  F.max("cumulative_fruit_drop_ratio").alias("last_survey_cumulative_fruit_drop_ratio_across_cane")
).orderBy("KPIN", "bay", ascending=True)

last_survey_cumulative_fruit_drop_ratio_across_cane_sdf.printSchema()
last_survey_cumulative_fruit_drop_ratio_across_cane_sdf.show()
last_survey_cumulative_fruit_drop_ratio_across_cane_sdf.count()

#### Result: Distribution of cumulative fruit drop across cane by KPIN, maturity area and bay at last fruit drop survey

In [0]:
last_survey_cumulative_fruit_drop_ratio_across_cane_pdf = last_survey_cumulative_fruit_drop_ratio_across_cane_sdf.toPandas()

last_survey_cumulative_fruit_drop_ratio_across_cane_pdf["KPIN_maturity_area_bay"] = last_survey_cumulative_fruit_drop_ratio_across_cane_pdf["KPIN"].astype(str) + "_" + last_survey_cumulative_fruit_drop_ratio_across_cane_pdf["maturity_area"].astype(str) + "_" + last_survey_cumulative_fruit_drop_ratio_across_cane_pdf["bay"].astype(str)

last_survey_cumulative_fruit_drop_ratio_across_cane_pdf = last_survey_cumulative_fruit_drop_ratio_across_cane_pdf.sort_values(by="KPIN_maturity_area_bay", ascending=False)

fig, ax = plt.subplots(figsize=(18,26))

sns.boxplot(x="last_survey_cumulative_fruit_drop_ratio_across_cane", y="KPIN_maturity_area_bay", data=last_survey_cumulative_fruit_drop_ratio_across_cane_pdf, orient="h", ax=ax)

ax.set_title("Distribution of cumulative fruit drop ratio across cane by KPIN, maturity area and bay at last fruit drop survey")

ax.grid(True)

fig.savefig(local_tmp_artifact_dir_path + f"distribution_last_cumulative_fruit_drop_ratio_across_cane_by_kpin_maturity_area_bay.png")
plt.show()

## Compute cumulative weather stress

### Filter high-temperature hours in weather data set

In [0]:
high_temperature_weather_sdf = complete_hourly_timeseries_weather_data_sdf.withColumn(
  "weather_date", 
  to_date("datetime")
).filter(
  complete_hourly_timeseries_weather_data_sdf.avg_temperature_C >= params["selected_high_temp_value"]
)

high_temperature_weather_count_sdf = high_temperature_weather_sdf.count()
mlflow.log_metric("high_temperature_weather_count", high_temperature_weather_count_sdf)

high_temperature_weather_sdf.printSchema()
high_temperature_weather_sdf.show()
high_temperature_weather_count_sdf

### Compute the cumulative count of high-temperature hours by weather station

#### Load cumulative weather stress schema

In [0]:
from cumulative_weather_stress_schema import cumulative_weather_stress_schema

cumulative_weather_stress_schema

In [0]:
from compute_cumulative_weather_stress import compute_cumulative_weather_stress

cumulative_weather_stress_sdf = compute_cumulative_weather_stress(high_temperature_weather_sdf)

cumulative_weather_stress_sdf.printSchema()
cumulative_weather_stress_sdf.show()
cumulative_weather_stress_sdf.count()

In [0]:
assert cumulative_weather_stress_schema == cumulative_weather_stress_sdf.schema

In [0]:
cumulative_weather_stress_summary_sdf = cumulative_weather_stress_sdf.summary()
cumulative_weather_stress_summary_sdf.show()

In [0]:
cumulative_weather_stress_summary_pdf = cumulative_weather_stress_summary_sdf.toPandas()
cumulative_weather_stress_summary_pdf.to_csv(local_tmp_artifact_dir_path + 'cumulative_weather_stress_summary.csv', index=False)

### Cumulative weather stress by station

In [0]:
# Number of rows of the cumulative weather stress data set per station, largest first.
# NOTE(review): the row count is renamed to "cumulative_weather_stress"; it equals the station's
# final cumulative value only if the input has exactly one row per counted high-temperature hour.
# If the input is one row per date, max("cumulative_weather_stress") would be the matching figure — confirm.
cumulative_weather_stress_by_station_sdf = cumulative_weather_stress_sdf.groupBy("station").count().withColumnRenamed("count", "cumulative_weather_stress").orderBy("cumulative_weather_stress", ascending=False)

cumulative_weather_stress_by_station_sdf.printSchema()
cumulative_weather_stress_by_station_sdf.show()
cumulative_weather_stress_by_station_sdf.count()

In [0]:
cumulative_weather_stress_by_station_pdf = cumulative_weather_stress_by_station_sdf.toPandas()
cumulative_weather_stress_by_station_pdf.to_csv(local_tmp_artifact_dir_path + 'cumulative_weather_stress_by_station.csv', index=False)

### Time chart of cumulative weather stress by station

In [0]:
# Bring the cumulative weather stress series to pandas for plotting.
cumulative_weather_stress_pdf = cumulative_weather_stress_sdf.toPandas()

fig, ax = plt.subplots(figsize=(28, 16))

# One line per station. ax.plot connects points in row order, and nothing visible here guarantees
# the Spark result arrives ordered by date, so each station's series is sorted first.
for station, data in cumulative_weather_stress_pdf.groupby('station'):
  data = data.sort_values('date')
  ax.plot(data['date'], data['cumulative_weather_stress'], label=station)

ax.set_title("Cumulative hourly weather stress by station")
ax.set_xlabel("Date")
ax.set_ylabel("Cumulative Weather Stress")
ax.grid(True)
ax.legend()

# Plain string: the file name has no placeholders, so no f-prefix is needed.
fig.savefig(local_tmp_artifact_dir_path + "time_chart_cumulative_weather_stress_pdf.png")
plt.show()

## Cross-reference cane-level fruit drop ratio to cumulative weather stress

### Load joint fruit drop cumulative weather stress schema

In [0]:
from joint_fruit_drop_cumulative_weather_stress_schema import joint_fruit_drop_cumulative_weather_stress_schema

joint_fruit_drop_cumulative_weather_stress_schema

In [0]:
from cross_reference_cane_level_fruit_drop_and_weather_stress import cross_reference_cane_level_fruit_drop_and_weather_stress

joint_fruit_drop_cumulative_weather_stress_sdf = cross_reference_cane_level_fruit_drop_and_weather_stress(joint_cane_level_fruit_drop_sdf, cumulative_weather_stress_sdf)

joint_fruit_drop_cumulative_weather_stress_count_sdf = joint_fruit_drop_cumulative_weather_stress_sdf.count()
mlflow.log_metric("joint_fruit_drop_cumulative_weather_stress_count", joint_fruit_drop_cumulative_weather_stress_count_sdf)

joint_fruit_drop_cumulative_weather_stress_sdf.printSchema()
joint_fruit_drop_cumulative_weather_stress_sdf.show()
joint_fruit_drop_cumulative_weather_stress_count_sdf

In [0]:
assert joint_fruit_drop_cumulative_weather_stress_schema == joint_fruit_drop_cumulative_weather_stress_sdf.schema

In [0]:
joint_fruit_drop_cumulative_weather_stress_summary_sdf = joint_fruit_drop_cumulative_weather_stress_sdf.summary()

joint_fruit_drop_cumulative_weather_stress_summary_sdf.show()

In [0]:
joint_fruit_drop_cumulative_weather_stress_summary_pdf = joint_fruit_drop_cumulative_weather_stress_summary_sdf.toPandas()
joint_fruit_drop_cumulative_weather_stress_summary_pdf.to_csv(local_tmp_artifact_dir_path + 'joint_fruit_drop_cumulative_weather_stress_summary.csv', index=False)

### Validate joint fruit drop cumulative weather stress data

In [0]:
from joint_fruit_drop_cumulative_weather_stress_schema import validate_joint_fruit_drop_cumulative_weather_stress_sdf

# The project validator returns either None or a DataFrame of invalid records.
subset_invalid_records_sdf = validate_joint_fruit_drop_cumulative_weather_stress_sdf(joint_fruit_drop_cumulative_weather_stress_sdf)

if subset_invalid_records_sdf is not None:
  subset_invalid_records_sdf.printSchema()
  subset_invalid_records_sdf.show()
  invalid_records_count = subset_invalid_records_sdf.count()
else:
  invalid_records_count = 0

# Stop the notebook as soon as any invalid record exists.
if invalid_records_count > 0:
  raise AssertionError(f"{invalid_records_count} invalid record(s) found in joint fruit drop cumulative weather stress data.")

# Reached for a None result and for an empty DataFrame; the latter previously printed nothing.
print("No invalid records found.")

### Result: Heatmap (hexbin) of cumulative fruit drop ratio and cumulative weather stress

In [0]:
joint_fruit_drop_cumulative_weather_stress_pdf = joint_fruit_drop_cumulative_weather_stress_sdf.toPandas()

fig, ax = plt.subplots(figsize=(22, 14))

joint_fruit_drop_cumulative_weather_stress_pdf.plot.hexbin(
  "cumulative_weather_stress",
  "cumulative_fruit_drop_ratio",
  gridsize=40,
  ax=ax,
  edgecolors="grey",
  cmap="inferno",
  mincnt=1
)

ax.grid(True)
plt.title("Distribution of cumulative fruit drop and cumulative weather stress")

fig.savefig(local_tmp_artifact_dir_path + f"distribution_cumulative_fruit_drop_and_cumulative_weather_stress.png")

### Result: Scatter chart of cumulative fruit drop ratio and cumulative weather stress

In [0]:
fig, ax = plt.subplots(figsize=(16, 14))

sns.regplot(data=joint_fruit_drop_cumulative_weather_stress_pdf, x="cumulative_weather_stress", y="cumulative_fruit_drop_ratio", order=1, ax=ax, color="orange", label="linear")
sns.regplot(data=joint_fruit_drop_cumulative_weather_stress_pdf, x="cumulative_weather_stress", y="cumulative_fruit_drop_ratio", order=2, ax=ax, color="green", label="quadratic")
sns.scatterplot(data=joint_fruit_drop_cumulative_weather_stress_pdf, x="cumulative_weather_stress", y="cumulative_fruit_drop_ratio", ax=ax, color="royalblue")

ax.set_xlabel("cumulative weather stress")
ax.set_ylabel("cumulative fruit drop ratio")
ax.set_title("Scatter plot of cumulative fruit drop ratio vs. cumulative weather stress")
ax.grid(True)
ax.legend()
fig.savefig(local_tmp_artifact_dir_path + f"scatterplot_fruit_drop_cumulative_weather_stress.png")
plt.show()

### Compute regression metrics for cumulative fruit drop ratio and cumulative weather stress

In [0]:
# Fit linear and quadratic regressions of cumulative fruit drop ratio on cumulative weather stress
# and log their in-sample errors to MLflow.
regression_columns = ["cumulative_fruit_drop_ratio", "cumulative_weather_stress"]

# Rows usable for fitting: both outcome and predictor present. Checking the result for emptiness
# covers an empty input, an all-null column, and the case where no single row has both values
# (which previously reached fit() with zero samples and raised).
complete_rows_pdf = joint_fruit_drop_cumulative_weather_stress_pdf.dropna(subset=regression_columns)

if not complete_rows_pdf.empty:
  # As before, the notebook-level DataFrame is left without incomplete rows once a fit is performed.
  joint_fruit_drop_cumulative_weather_stress_pdf = complete_rows_pdf

  predictors = joint_fruit_drop_cumulative_weather_stress_pdf[["cumulative_weather_stress"]]
  outcome = joint_fruit_drop_cumulative_weather_stress_pdf["cumulative_fruit_drop_ratio"]

  # Linear Regression
  regression_model = LinearRegression()
  regression_model.fit(predictors, outcome)

  y_predicted = regression_model.predict(predictors)
  mse = mean_squared_error(outcome, y_predicted)
  mae = mean_absolute_error(outcome, y_predicted)

  # coef_ is a one-element array for a single predictor; MLflow metric values are scalar floats,
  # so the slope is extracted explicitly instead of passing the array.
  slope = float(regression_model.coef_[0])

  mlflow.log_metric("MSE_cumulative_weather_stress_fruit_drop_ratio", mse)
  mlflow.log_metric("MAE_cumulative_weather_stress_fruit_drop_ratio", mae)
  mlflow.log_metric("Slope_cumulative_weather_stress_fruit_drop_ratio", slope)

  print("Slope:", regression_model.coef_) 
  print("Mean squared error (Linear):", mse)
  print("Mean absolute error (Linear):", mae)

  # Quadratic Regression
  # PolynomialFeatures(degree=2) already emits the constant column, hence fit_intercept=False.
  regression_pipeline = Pipeline([("poly", PolynomialFeatures(degree=2)), ("linear", LinearRegression(fit_intercept=False))])
  regression_pipeline.fit(predictors, outcome)

  y_predicted_quad = regression_pipeline.predict(predictors)
  mse_quad = mean_squared_error(outcome, y_predicted_quad)
  mae_quad = mean_absolute_error(outcome, y_predicted_quad)

  mlflow.log_metric("MSE_quadratic_cumulative_weather_stress_fruit_drop_ratio", mse_quad)
  mlflow.log_metric("MAE_quadratic_cumulative_weather_stress_fruit_drop_ratio", mae_quad)

  print("Mean squared error (Quadratic):", mse_quad)
  print("Mean absolute error (Quadratic):", mae_quad)

else:
  print("No records to perform linear regression.")

## Compute vapour pressure deficit

Vapour pressure deficit (VPD) is computed from the daily maximum and minimum air temperatures and the corresponding relative humidity, all derived from the average hourly temperature and average hourly humidity percentage in the weather data set.

> Referencing the procedures outlined in https://www.fao.org/4/x0490e/x0490e07.htm#calculation%20procedures

1. Use equations 11 and 12 to compute the mean saturation vapour pressure (es)
2. Apply equation 17 to derive actual vapour pressure (ea) from relative humidity data
3. Compute vapour pressure deficit (es - ea)

### Load vapour pressure deficit with weather data details schema

In [0]:
from weather_analysis_data.vapour_pressure_deficit_schema import vapour_pressure_deficit_schema

vapour_pressure_deficit_schema

In [0]:
from weather_analysis_data.compute_vapour_pressure_deficit import compute_vapour_pressure_deficit

# Compute VPD from the selected hourly weather data with the project helper.
# NOTE(review): the meaning of the second argument (1) is not visible in this notebook — confirm
# in compute_vapour_pressure_deficit and consider passing it by keyword or from `params`.
vapour_pressure_deficit_sdf = compute_vapour_pressure_deficit(complete_hourly_timeseries_weather_data_sdf, 1)

vapour_pressure_deficit_sdf.printSchema()
vapour_pressure_deficit_sdf.show()
vapour_pressure_deficit_sdf.count()

In [0]:
assert vapour_pressure_deficit_schema == vapour_pressure_deficit_sdf.schema

In [0]:
vapour_pressure_deficit_pdf = vapour_pressure_deficit_sdf.toPandas()

vapour_pressure_deficit_summary_pdf = vapour_pressure_deficit_pdf.describe()

vapour_pressure_deficit_summary_pdf

In [0]:
vapour_pressure_deficit_summary_pdf.reset_index().to_csv(local_tmp_artifact_dir_path + 'vapour_pressure_deficit_summary.csv', index=False)

### Time chart of vapour pressure deficit by station

In [0]:
fig, ax = plt.subplots(figsize=(28, 16))

# One line per station. ax.plot connects points in row order, and nothing visible here guarantees
# the Spark result arrives ordered by date, so each station's series is sorted first.
for station, data in vapour_pressure_deficit_pdf.groupby('station'):
  data = data.sort_values('date')
  ax.plot(data['date'], data['vpd'], label=station)

ax.set_title("Vapour pressure deficit by station")
ax.set_xlabel("Date")
ax.set_ylabel("Vapour pressure deficit (kPa)")
ax.grid(True)
ax.legend()

# Plain string: the file name has no placeholders, so no f-prefix is needed.
fig.savefig(local_tmp_artifact_dir_path + "time_chart_vapour_pressure_deficit_pdf.png")
plt.show()

## CDF diagram for vapour pressure deficit by station

In [0]:
# Quantile levels 0.00, 0.02, ..., 1.00 (51 points). linspace fixes the number of points and the
# exact end value; arange with a float step relies on rounding to decide whether an extra value
# above 1.0 is produced, which quantile() would reject.
percentiles = np.linspace(0, 1, 51)

# Per-station VPD quantiles: unstack puts the quantile levels in columns, and the transpose
# then yields one row per quantile level and one column per station.
grouped = vapour_pressure_deficit_pdf.groupby("station")
quantiles_pdf = grouped['vpd'].quantile(q=percentiles).unstack()
quantiles_vpd_by_station_pdf = quantiles_pdf.transpose().rename_axis(columns='quantiles')

quantiles_vpd_by_station_pdf

In [0]:
quantiles_vpd_by_station_pdf.reset_index().to_csv(local_tmp_artifact_dir_path + 'quantiles_vpd_by_station.csv', index=False)

### Result: CDF diagram for vapour pressure deficit by station

In [0]:
fig, ax = plt.subplots(figsize=(28, 18))

# One curve per station: VPD value on the x-axis against its quantile level on the y-axis.
# All drawing goes through `ax` (rather than pyplot's implicit current axes) so the chart is
# guaranteed to land on the figure that is saved below.
for station in quantiles_vpd_by_station_pdf.columns:
  ax.plot(
    quantiles_vpd_by_station_pdf[station],
    quantiles_vpd_by_station_pdf.index,
    label=station,
    marker="*",
  )

ax.set_title("CDF diagram for vapour pressure deficit by station")
ax.set_ylabel("Quantiles")
ax.set_xlabel("Vapour pressure deficit (kPa)")
ax.legend(title="Station")
ax.grid(True)

# Save before showing, as the other chart cells do; the file name has no placeholders, so no f-prefix.
fig.savefig(local_tmp_artifact_dir_path + "cdf_vpd_by_station.png")
plt.show()

## Compute cumulative vapour pressure deficit

### Filter high daily vapour pressure deficit in weather data

In [0]:
high_vapour_pressure_deficit_sdf = vapour_pressure_deficit_sdf.filter(
  vapour_pressure_deficit_sdf.vpd >= params["selected_high_vapour_pressure_deficit_value"]
)

high_vapour_pressure_deficit_count_sdf = high_vapour_pressure_deficit_sdf.count()
mlflow.log_metric("high_vapour_pressure_deficit_count", high_vapour_pressure_deficit_count_sdf)

high_vapour_pressure_deficit_sdf.printSchema()
high_vapour_pressure_deficit_sdf.show()
high_vapour_pressure_deficit_count_sdf

### Compute cumulative high daily vapour pressure deficit

#### Load cumulative vapour pressure deficit schema

In [0]:
from cumulative_vapour_pressure_deficit_stress_schema import cumulative_vapour_pressure_deficit_stress_schema

cumulative_vapour_pressure_deficit_stress_schema

In [0]:
from compute_cumulative_vapour_pressure_deficit_stress import compute_cumulative_vapour_pressure_deficit_stress

cumulative_vapour_pressure_deficit_stress_sdf = compute_cumulative_vapour_pressure_deficit_stress(high_vapour_pressure_deficit_sdf)

cumulative_vapour_pressure_deficit_stress_sdf.printSchema()
cumulative_vapour_pressure_deficit_stress_sdf.show()
cumulative_vapour_pressure_deficit_stress_sdf.count()

In [0]:
assert cumulative_vapour_pressure_deficit_stress_schema == cumulative_vapour_pressure_deficit_stress_sdf.schema

### Cumulative vapour pressure deficit by station

In [0]:
# Number of rows of the cumulative VPD stress data set per station, largest first.
# NOTE(review): the row count is renamed to "cumulative_vapour_pressure_deficit_stress"; it equals
# the station's final cumulative value only if the input has exactly one row per counted
# high-VPD observation — confirm against compute_cumulative_vapour_pressure_deficit_stress.
cumulative_vapour_pressure_deficit_stress_by_station_sdf = cumulative_vapour_pressure_deficit_stress_sdf.groupBy("station").count().withColumnRenamed("count", "cumulative_vapour_pressure_deficit_stress").orderBy("cumulative_vapour_pressure_deficit_stress", ascending=False)

cumulative_vapour_pressure_deficit_stress_by_station_sdf.printSchema()
cumulative_vapour_pressure_deficit_stress_by_station_sdf.show()
cumulative_vapour_pressure_deficit_stress_by_station_sdf.count()

In [0]:
cumulative_vapour_pressure_deficit_stress_by_station_pdf = cumulative_vapour_pressure_deficit_stress_by_station_sdf.toPandas()
cumulative_vapour_pressure_deficit_stress_by_station_pdf.to_csv(local_tmp_artifact_dir_path + 'cumulative_vapour_pressure_deficit_stress_by_station.csv', index=False)

### Time chart of cumulative vapour pressure deficit stress by station

In [0]:
# Bring the cumulative VPD stress series to pandas for plotting.
cumulative_vapour_pressure_deficit_stress_pdf = cumulative_vapour_pressure_deficit_stress_sdf.toPandas()

fig, ax = plt.subplots(figsize=(28, 16))

# One line per station. ax.plot connects points in row order, and nothing visible here guarantees
# the Spark result arrives ordered by date, so each station's series is sorted first.
for station, data in cumulative_vapour_pressure_deficit_stress_pdf.groupby('station'):
  data = data.sort_values('date')
  ax.plot(data['date'], data['cumulative_vapour_pressure_deficit_stress'], label=station)

ax.set_title("Cumulative daily vapour pressure deficit stress by station")
ax.set_xlabel("Date")
ax.set_ylabel("Cumulative vapour pressure deficit stress")
ax.grid(True)
ax.legend()

# Plain string: the file name has no placeholders, so no f-prefix is needed.
fig.savefig(local_tmp_artifact_dir_path + "time_chart_cumulative_vapour_pressure_deficit_stress.png")
plt.show()

## Cross-reference cane-level fruit drop ratio to cumulative vapour pressure deficit stress

### Load joint cane-level fruit drop cumulative vapour pressure deficit schema

In [0]:
from joint_fruit_drop_cumulative_vpd_schema import joint_fruit_drop_cumulative_vpd_schema

joint_fruit_drop_cumulative_vpd_schema

In [0]:
from cross_reference_cane_level_fruit_drop_and_vpd_stress import cross_reference_cane_level_fruit_drop_and_vpd_stress

joint_fruit_drop_cumulative_vpd_stress_sdf = cross_reference_cane_level_fruit_drop_and_vpd_stress(joint_cane_level_fruit_drop_sdf, cumulative_vapour_pressure_deficit_stress_sdf)

joint_fruit_drop_cumulative_vpd_stress_count_sdf = joint_fruit_drop_cumulative_vpd_stress_sdf.count()
mlflow.log_metric("joint_fruit_drop_cumulative_vpd_stress_count", joint_fruit_drop_cumulative_vpd_stress_count_sdf)

joint_fruit_drop_cumulative_vpd_stress_sdf.printSchema()
joint_fruit_drop_cumulative_vpd_stress_sdf.show()
joint_fruit_drop_cumulative_vpd_stress_count_sdf

### Result: Heatmap (hexbin) of cumulative fruit drop ratio and cumulative vapour pressure deficit stress

In [0]:
joint_fruit_drop_cumulative_vpd_stress_pdf = joint_fruit_drop_cumulative_vpd_stress_sdf.toPandas()

fig, ax = plt.subplots(figsize=(22, 14))

joint_fruit_drop_cumulative_vpd_stress_pdf.plot.hexbin(
  "cumulative_vapour_pressure_deficit_stress",
  "cumulative_fruit_drop_ratio",
  gridsize=40,
  ax=ax,
  edgecolors="grey",
  cmap="inferno",
  mincnt=1
)

ax.grid(True)
plt.title("Distribution of cumulative fruit drop and cumulative vapour pressure deficit stress")

fig.savefig(local_tmp_artifact_dir_path + f"distribution_cumulative_fruit_drop_and_vpd_stress.png")

### Result: Scatter chart of cumulative fruit drop ratio and cumulative vapour pressure deficit stress

In [0]:
fig, ax = plt.subplots(figsize=(16, 14))

sns.regplot(data=joint_fruit_drop_cumulative_vpd_stress_pdf, x="cumulative_vapour_pressure_deficit_stress", y="cumulative_fruit_drop_ratio", order=1, ax=ax, color="orange", label="linear")
sns.regplot(data=joint_fruit_drop_cumulative_vpd_stress_pdf, x="cumulative_vapour_pressure_deficit_stress", y="cumulative_fruit_drop_ratio", order=2, ax=ax, color="green", label="quadratic")
sns.scatterplot(data=joint_fruit_drop_cumulative_vpd_stress_pdf, x="cumulative_vapour_pressure_deficit_stress", y="cumulative_fruit_drop_ratio", ax=ax, color="royalblue")

ax.set_title("Scatter plot of cumulative fruit drop ratio vs. cumulative vapour pressure deficit stress")
ax.grid(True)
ax.legend()

fig.savefig(local_tmp_artifact_dir_path + f"scatterplot_fruit_drop_cumulative_vpd_stress.png")
plt.show()

### Compute regression metrics for cumulative fruit drop ratio and cumulative vapour pressure deficit stress

In [0]:
# Fit linear and quadratic regressions of cumulative fruit drop ratio on cumulative VPD stress
# and log their in-sample errors to MLflow.
regression_columns = ["cumulative_fruit_drop_ratio", "cumulative_vapour_pressure_deficit_stress"]

# Rows usable for fitting: both outcome and predictor present. Checking the result for emptiness
# covers an empty input, an all-null column, and the case where no single row has both values
# (which previously reached fit() with zero samples and raised).
complete_rows_pdf = joint_fruit_drop_cumulative_vpd_stress_pdf.dropna(subset=regression_columns)

if not complete_rows_pdf.empty:
  # As before, the notebook-level DataFrame is left without incomplete rows once a fit is performed.
  joint_fruit_drop_cumulative_vpd_stress_pdf = complete_rows_pdf

  predictors = joint_fruit_drop_cumulative_vpd_stress_pdf[["cumulative_vapour_pressure_deficit_stress"]]
  outcome = joint_fruit_drop_cumulative_vpd_stress_pdf["cumulative_fruit_drop_ratio"]

  # Linear Regression
  regression_model = LinearRegression()
  regression_model.fit(predictors, outcome)

  y_predicted = regression_model.predict(predictors)
  mse_linear = mean_squared_error(outcome, y_predicted)
  mae_linear = mean_absolute_error(outcome, y_predicted)

  # coef_ is a one-element array for a single predictor; MLflow metric values are scalar floats,
  # so the slope is extracted explicitly instead of passing the array.
  slope = float(regression_model.coef_[0])

  mlflow.log_metric("MSE_cumulative_vpd_stress_fruit_drop_ratio", mse_linear)
  mlflow.log_metric("MAE_cumulative_vpd_stress_fruit_drop_ratio", mae_linear)
  mlflow.log_metric("Slope_cumulative_vpd_stress_fruit_drop_ratio", slope)

  print("Slope:", regression_model.coef_) 
  print("Mean squared error (Linear):", mse_linear)
  print("Mean absolute error (Linear):", mae_linear)

  # Quadratic Regression
  # PolynomialFeatures(degree=2) already emits the constant column, hence fit_intercept=False.
  regression_pipeline = Pipeline([("poly", PolynomialFeatures(degree=2)), ("linear", LinearRegression(fit_intercept=False))])
  regression_pipeline.fit(predictors, outcome)

  y_predicted_quad = regression_pipeline.predict(predictors)
  mse_quad = mean_squared_error(outcome, y_predicted_quad)
  mae_quad = mean_absolute_error(outcome, y_predicted_quad)

  mlflow.log_metric("MSE_quadratic_cumulative_vpd_stress_fruit_drop_ratio", mse_quad)
  mlflow.log_metric("MAE_quadratic_cumulative_vpd_stress_fruit_drop_ratio", mae_quad)

  print("Mean squared error (Quadratic):", mse_quad)
  print("Mean absolute error (Quadratic):", mae_quad)

else:
  print("No records to perform linear regression.")

## Wrap up

In [0]:
mlflow.log_artifacts(local_tmp_artifact_dir_path) 

In [0]:
# Remove this run's temporary directory after its artifacts have been logged to MLflow.
# local_tmp_artifact_dir_path ends with "/", so the inner dirname only strips that slash
# (giving ".../<run_id>/artifacts") and the outer one yields the per-run directory ".../<run_id>".
tmp_dir = os.path.dirname(os.path.dirname(local_tmp_artifact_dir_path))
# NOTE(review): the "file:" prefix presumably makes dbutils address the driver's local filesystem
# rather than DBFS — confirm for this workspace.
dbutils.fs.rm("file:" + tmp_dir, recurse=True)

In [0]:
# End MLflow run
mlflow.end_run()