# Weather forecasting with PySpark
## Big Data Computing final project

### Define some global constants

In [0]:
# Root DBFS folders for the dataset files and for the persisted pipelines/models
DATASET_PATH: str = 'dbfs:/bigdata_proj/datasets/historical-hourly-weather/'
MODELS_PATH: str = 'dbfs:/bigdata_proj/models/historical-hourly-weather/'

# Fixed seed (presumably passed to the sampling/splitting/training steps later in the notebook)
RANDOM_SEED: int = 42

# NOTE(review): presumably enables the time-consuming cells of the notebook; confirm against its usages
SLOW_OPERATIONS: bool = True

# True to save the results of dataset preprocessing, fitted pipelines and trained models to the filesystem
SAVE_COMPUTATIONS: bool = True

# True to load the sampled dataset from the filesystem, False to compute it from the raw one
LOAD_SAMPLED_DATASET: bool = True
SAMPLED_DATASET_PATH: str = f'{DATASET_PATH}aggregated_sampled_weather_measurements.csv'

# True to load the encoding pipeline from the filesystem, False to compute it from scratch
LOAD_ENCODING_PIPELINE: bool = True
# Misspelled legacy name, kept as an alias so that existing references keep working
LOAD_ECONDING_PIPELINE: bool = LOAD_ENCODING_PIPELINE
ENCODING_PIPELINE_PATH: str = f'{MODELS_PATH}data_encoder'

# True to load pretrained models from the filesystem, False to compute them from scratch
LOAD_PRETRAINED_MODELS: bool = True
RANDOM_FOREST_MODEL_PATH: str = f'{MODELS_PATH}rnd_forest'
RANDOM_FOREST_CROSS_VALIDATION_MODEL_PATH: str = f'{MODELS_PATH}rnd_forest_cv'
LOGISTIC_REGRESSION_CROSS_VALIDATION_MODEL_PATH: str = f'{MODELS_PATH}log_reg_cv'

# Necessary due to Databricks Community Edition limits
# (training on a dataframe larger than this threshold causes an Internal Server Error)
MAX_TRAIN_SIZE: int = 999_999

### Import PySpark packages and other dependencies

In [0]:
import pyspark
import numpy as np
import matplotlib.pyplot as plt

from pyspark import SparkContext, SparkConf
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.tuning import CrossValidatorModel

from typing import *

## Dataset initialization

### Download the dataset
Original source: [kaggle.com/selfishgene/historical-hourly-weather-data](https://www.kaggle.com/selfishgene/historical-hourly-weather-data)

In [0]:
%sh
# Fetch the zipped dataset from the project's GitHub repository, then extract it;
# `unzip -u` only writes files that are missing or older than the archived copy.
ARCHIVE_URL='https://github.com/andrea-gasparini/big-data-weather-forecasting/raw/master/dataset/historical-hourly-weather-dataset.zip'
ARCHIVE_FILE='/tmp/dataset.zip'
EXTRACT_DIR='/tmp/dataset'

wget --no-verbose --output-document="$ARCHIVE_FILE" "$ARCHIVE_URL"
unzip -u "$ARCHIVE_FILE" -d "$EXTRACT_DIR"

2024-03-16 17:05:24 URL:https://raw.githubusercontent.com/andrea-gasparini/big-data-weather-forecasting/master/dataset/historical-hourly-weather-dataset.zip [12655281/12655281] -> "/tmp/dataset.zip" [1]


Archive:  /tmp/dataset.zip
   creating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/._committed_7616641238230246128.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/.part-00000-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413e3b2b1a-5515-1-c000.csv.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/.part-00006-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413e3b2b1a-5521-1-c000.csv.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/.part-00002-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413e3b2b1a-5517-1-c000.csv.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/.part-00004-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413e3b2b1a-5519-1-c000.csv.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/part-00003-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413

### Move the dataset from the Databricks driver node's local file system to DBFS

In [0]:
# Relocate every extracted entry from the local /tmp/dataset folder into DATASET_PATH,
# moving directories together with their content (recurse=True)
for extracted_entry in dbutils.fs.ls('file:/tmp/dataset'):
    destination_path = f'{DATASET_PATH}{extracted_entry.name}'
    dbutils.fs.mv(extracted_entry.path, destination_path, recurse=True)

In [0]:
# List the dataset folder through the DATASET_PATH constant instead of repeating
# its value, so the listing always targets the folder the files were moved into
display(dbutils.fs.ls(DATASET_PATH))

path,name,size,modificationTime
dbfs:/bigdata_proj/datasets/historical-hourly-weather/aggregated_sampled_weather_measurements.csv/,aggregated_sampled_weather_measurements.csv/,0,1710607244000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/city_attributes.csv,city_attributes.csv,1614,1710608726000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/humidity.csv,humidity.csv,9075077,1710608725000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/pressure.csv,pressure.csv,12155911,1710608726000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/temperature.csv,temperature.csv,13971171,1710608726000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/weather_description.csv,weather_description.csv,21858089,1710608728000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/wind_direction.csv,wind_direction.csv,10171003,1710608726000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/wind_speed.csv,wind_speed.csv,7457531,1710608728000


### Load the dataset into Spark DataFrame objects

### Dataset shape and schema
The raw dataset downloaded from kaggle is composed of 7 different `.csv` files:
- `city_attributes.csv` contains geographical information about the different cities for which there are weather measurements
- `weather_description.csv` contains the textual description of the weather conditions, where each column refers to a different city and each row refers to a specific `datetime` in which the weather condition occurred
- Each of the other 5 CSV files follows the same structure as `weather_description.csv` and contains the measurements of one of the following metrics: `humidity`, `pressure`, `temperature`, `wind_direction`, `wind_speed`

Except for `city_attributes.csv`, all the other files contain about **45.000** records of hourly weather measurements, which, multiplied by the **36** cities, results in approximately **1.500.000** records.

In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session when one exists, otherwise start a new one
spark = SparkSession.builder.getOrCreate()


def _read_raw_csv(file_name):
    """Read one raw dataset file located under DATASET_PATH into a DataFrame.

    The first row is used as the header and column types are inferred by Spark.
    """
    return spark.read.csv(f'{DATASET_PATH}{file_name}', header=True, inferSchema=True)


# One DataFrame per raw CSV file of the dataset
weather_conditions_df = _read_raw_csv('weather_description.csv')
humidity_df = _read_raw_csv('humidity.csv')
pressure_df = _read_raw_csv('pressure.csv')
temperature_df = _read_raw_csv('temperature.csv')
city_attributes_df = _read_raw_csv('city_attributes.csv')
wind_direction_df = _read_raw_csv('wind_direction.csv')
wind_speed_df = _read_raw_csv('wind_speed.csv')