# Analyze raw weather data set

## Objectives 
1. Load raw data set
2. Check data set schema
3. Compute statistical distribution for data set features

## Preamble

In [0]:
import os
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, when
# Apply seaborn's "darkgrid" theme to the plots drawn in this notebook.
sns.set_theme(style="darkgrid")

In [0]:
# Select the MLflow experiment under which this notebook's run is recorded.
experiment_name = "/Lab_Project_FruitDropZGS/mlflow_experiment/analyze_raw_weather_data_set_experiment"
mlflow.set_experiment(experiment_name)

In [0]:
# Start the MLflow run; it stays active until mlflow.end_run() is called in the
# wrap-up section at the end of the notebook.
run_mlflow = mlflow.start_run()

# Per-run scratch directory for artifacts. The trailing slash is intentional:
# later cells build file paths by plain string concatenation.
local_tmp_artifact_dir_path = (
    "/Workspace/Lab_Project_FruitDropZGS/notebook_artifacts/tmp/"
    + run_mlflow.info.run_id
    + "/artifacts/"
)

# exist_ok=True replaces the exists()-then-create check, which can raise
# FileExistsError if the directory appears between the two calls.
os.makedirs(local_tmp_artifact_dir_path, exist_ok=True)

local_tmp_artifact_dir_path

## Configure parameters

In [0]:
# Import only the mapping this notebook uses instead of every public name of
# the module, so nothing else can silently shadow notebook variables.
from raw_weather_data_sets import raw_weather_2024_data_sets

# Key of the weather data set to analyze.
table_name = "metaponto_san_marco"

# Value registered for that key; it is what spark.table() receives when the
# data is loaded.
param = raw_weather_2024_data_sets[table_name]

# Record on the MLflow run which table was analyzed.
mlflow.log_param("table_name", param)

param

## Load data

In [0]:
# Schema that the preprocessed weather data is compared against further down.
from raw_weather_data_expected_schema import expected_schema

# Display the expected schema as the cell output.
expected_schema

In [0]:
# Read the source table selected in the parameter section.
initial_weather_sdf = spark.table(param)

# Register the untouched table as an input data set of the MLflow run.
initial_weather_dataset = mlflow.data.from_spark(
    initial_weather_sdf,
    name=f"initial_{table_name}",
)
mlflow.log_input(initial_weather_dataset)

# Row count before preprocessing (a plain int, despite the _sdf suffix).
initial_weather_count_sdf = initial_weather_sdf.count()
mlflow.log_metric(
    key="initial_weather_count",
    value=initial_weather_count_sdf,
)

# Show the schema and a sample of rows, then the row count as the cell output.
initial_weather_sdf.printSchema()
initial_weather_sdf.show()
initial_weather_count_sdf

In [0]:
# Import only the function this notebook uses instead of every public name of
# the module, so nothing else can silently shadow notebook variables.
from preprocess_raw_weather_data import preprocess

# Run the project's preprocessing step on the raw table.
weather_sdf = preprocess(initial_weather_sdf)

# Register the preprocessed data as a second input data set of the MLflow run.
revised_weather_dataset = mlflow.data.from_spark(weather_sdf, name=f"revised_{table_name}")
mlflow.log_input(revised_weather_dataset)

# Row count after preprocessing (a plain int, despite the _sdf suffix).
weather_count_sdf = weather_sdf.count()
mlflow.log_metric("revised_weather_count", weather_count_sdf)

# Show the schema and a sample of rows, then the row count as the cell output.
weather_sdf.printSchema()
weather_sdf.show()
weather_count_sdf

In [0]:
# Stop the notebook here, showing both schemas, when preprocessing does not
# produce the expected schema.
assert expected_schema == weather_sdf.schema, (
    "Unexpected schema after preprocessing.\n"
    f"expected: {expected_schema!r}\n"
    f"actual:   {weather_sdf.schema!r}"
)

## Compute statistical distribution

In [0]:
# Compute Spark's summary statistics for the preprocessed data and print them.
weather_summary_sdf = weather_sdf.summary()
weather_summary_sdf.show()

In [0]:
# Collect the summary to the driver as a pandas DataFrame.
weather_summary_pdf = weather_summary_sdf.toPandas()

# Store it in the run's temporary artifact directory; the directory path
# already ends with "/", so the file name is simply appended.
weather_summary_csv_path = local_tmp_artifact_dir_path + "weather_summary.csv"
weather_summary_pdf.to_csv(weather_summary_csv_path, index=False)

## Count of duplicate records by datetime

In [0]:
# Number of rows per datetime value ...
rows_per_datetime_sdf = weather_sdf.groupBy("datetime").count()
# ... keeping only the datetime values that occur more than once.
count_by_datetime_sdf = rows_per_datetime_sdf.where(F.col("count") > 1)

# Print a sample, then show how many datetime values are duplicated.
count_by_datetime_sdf.show()
count_by_datetime_sdf.count()

In [0]:
# Total number of rows sharing each duplicated datetime. The input already has
# one row per datetime, so each sum covers a single "count" value.
duplicate_records_count_by_datetime_sdf = (
    count_by_datetime_sdf
    .groupBy("datetime")
    .agg(F.sum(F.col("count")).alias("count_duplicates"))
)

# Print a sample, then show the number of rows in the result.
duplicate_records_count_by_datetime_sdf.show()
duplicate_records_count_by_datetime_sdf.count()

## Count of unique records

In [0]:
# Keep a single row for every datetime value.
unique_weather_sdf = weather_sdf.dropDuplicates(["datetime"])

# Row count after de-duplication (a plain int, despite the _sdf suffix).
unique_weather_count_sdf = unique_weather_sdf.count()
mlflow.log_metric(
    key="unique_weather_count",
    value=unique_weather_count_sdf,
)

# Show the schema and a sample of rows, then the row count as the cell output.
unique_weather_sdf.printSchema()
unique_weather_sdf.show()
unique_weather_count_sdf

In [0]:
# From here on, weather_sdf refers to the de-duplicated data.
weather_sdf = unique_weather_sdf

## Compute the daily time series of average temperature

In [0]:
# Only the timestamp and the average temperature are needed for the plot.
temperature_columns = ["datetime", "avg_temperature_C"]
weather_avg_temperature_sdf = weather_sdf.select(*temperature_columns)

# Show the schema and a sample of rows, then the row count as the cell output.
weather_avg_temperature_sdf.printSchema()
weather_avg_temperature_sdf.show()
weather_avg_temperature_sdf.count()

In [0]:
# Collect the two selected columns to the driver as a pandas DataFrame.
weather_avg_temperature_pdf = weather_avg_temperature_sdf.toPandas()

# Index by datetime and average the values within each calendar day ("D").
daily_resampler = weather_avg_temperature_pdf.set_index("datetime").resample("D")

# The variable name (a misspelling of "resample") is kept because the plotting
# cell reads it.
reasmple_weather_avg_temperature_pdf = daily_resampler.mean()
reasmple_weather_avg_temperature_pdf

In [0]:
# Line plot of the daily mean of the average temperature.
fig, ax = plt.subplots(figsize=(28, 18))
sns.lineplot(data=reasmple_weather_avg_temperature_pdf, x="datetime", y="avg_temperature_C", ax=ax)
ax.set_title(f"Distribution of average temperature in {table_name}")
ax.grid(True)

# Save before showing, so the exported image does not depend on whether the
# active backend keeps the figure intact after plt.show().
plot_path = local_tmp_artifact_dir_path + "distribution_of_avg_weather_temperature.png"
fig.savefig(plot_path)
plt.show()

## Wrap up

In [0]:
# Upload everything written to the temporary artifact directory (the summary
# CSV and the temperature plot) to the active MLflow run.
mlflow.log_artifacts(local_tmp_artifact_dir_path)

In [0]:
# Remove this run's temporary directory now that its artifacts have been
# logged to MLflow.
# local_tmp_artifact_dir_path ends with "/", so the first dirname() only strips
# that slash and the second strips "artifacts", leaving ".../tmp/<run_id>".
tmp_dir = os.path.dirname(os.path.dirname(local_tmp_artifact_dir_path))
# NOTE(review): the "file:" prefix presumably makes dbutils.fs address the
# driver's local filesystem rather than DBFS — confirm.
dbutils.fs.rm("file:" + tmp_dir, recurse=True)

In [0]:
# End the MLflow run that was started at the top of the notebook.
mlflow.end_run()