# Data pre-processing

### Import libraries

In [None]:
import calendar
import os
from datetime import datetime, timedelta
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv
from IPython.display import clear_output

import pyspark.pandas as ps
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

load_dotenv()

In [None]:
# Folder holding the per-measurement CSV files of the historical hourly weather dataset
DATASET_PATH = 'datasets/historical-hourly-weather-dataset/'
# Folder the "Data statistics" section reads the aggregated CSV part files from
AGGREGATED_DATASET_PATH = 'datasets/historical-hourly-weather-dataset/tmp'

### Data expansion

Support functions

In [None]:
def call_api_to_get_data(date, enddate, city = "Ho+Chi+Minh+City"):
    """Download hourly past-weather data from World Weather Online.

    Args:
        date: First day of the requested range, formatted as ``YYYY-MM-DD``.
        enddate: Last day of the requested range, formatted as ``YYYY-MM-DD``.
        city: Location query placed in the ``q`` parameter of the request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the ``api_key`` environment variable is not set.
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within 30 seconds.
    """
    api_key = os.getenv("api_key")
    if not api_key:
        # Fail early instead of sending the literal string "None" as the key
        raise RuntimeError("Environment variable 'api_key' is not set")

    # HTTPS keeps the API key out of clear-text traffic
    link = "https://api.worldweatheronline.com/premium/v1/past-weather.ashx?q={city}&date={date}&enddate={enddate}&key={api_key}&format=json&tp=1".format(
        date = date,
        enddate = enddate,
        city = city,
        api_key = api_key
    )
    # Without a timeout a stalled connection would block the crawl forever
    response = requests.get(link, timeout = 30)
    # Surface HTTP errors here rather than as a confusing KeyError in the caller
    response.raise_for_status()
    return response.json()

def get_lst_first_day_last_day(start_date = datetime(2012, 10, 1), end_date = datetime(2017, 11, 30)):
    """Return the first and last day of every month touched by a date range.

    Starting from the month of ``start_date``, one ``(first_day, last_day)``
    pair of ``YYYY-MM-DD`` strings is produced per month for as long as the
    month cursor is not after ``end_date``. Months are always reported in
    full, even when the range starts or ends in the middle of a month.
    """
    month_bounds = []
    cursor = start_date

    while cursor <= end_date:
        # monthrange returns (weekday of the 1st, number of days in the month)
        _, days_in_month = calendar.monthrange(cursor.year, cursor.month)

        month_start = cursor.replace(day = 1)
        month_end = cursor.replace(day = days_in_month)
        month_bounds.append(
            (month_start.strftime("%Y-%m-%d"), month_end.strftime("%Y-%m-%d"))
        )

        # Jump to the first day of the following month
        cursor = month_end + timedelta(days = 1)

    return month_bounds

def handle_append_data(json, humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr):
    """Append the hourly records of an API response to the accumulator arrays.

    Args:
        json: Decoded API response; hourly records are read from
            ``json["data"]["weather"][i]["hourly"]``.
        humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr,
        weather_desc_arr, temp_arr: NumPy arrays holding the values collected
            so far. They are not modified in place.

    Returns:
        A tuple ``(humidity_arr, time_arr, wind_dir_arr, wind_speed_arr,
        pressure_arr, weather_desc_arr, temp_arr)`` with the new values
        appended. Wind speed is converted from km/h to m/s and temperature
        from Celsius to Kelvin.
    """
    # `from pyspark.sql.functions import *` at the top of this file rebinds
    # `round` to the Spark column function, so call the builtin explicitly.
    import builtins

    times, humidities, wind_dirs, wind_speeds = [], [], [], []
    pressures, temps, weather_descs = [], [], []

    for day_data in json["data"]["weather"]:

        date = day_data["date"]
        for hour_data in day_data['hourly']:
            # The API encodes the hour as "0", "100", ..., "2300"
            hour = int(hour_data["time"]) // 100
            # Zero-pad the hour so the timestamp is a regular
            # "YYYY-MM-DD HH:MM:SS" string; an unpadded "0:00:00" can never
            # match a padded timestamp when frames are merged on `datetime`.
            times.append("{date} {hour:02d}:00:00".format(date = date, hour = hour))
            wind_speeds.append(builtins.round(float(hour_data["windspeedKmph"]) * (10/36), 2))
            wind_dirs.append(int(hour_data["winddirDegree"]))
            humidities.append(int(hour_data["humidity"]))
            pressures.append(int(hour_data["pressure"]))
            temps.append(int(hour_data["tempC"]) + 273.15)
            weather_descs.append(hour_data["weatherDesc"][0]["value"])

    if not times:
        # Nothing to append: hand the accumulators back untouched
        return humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr

    # One append per array instead of one per record avoids re-copying the
    # whole accumulator for every hour.
    return (
        np.append(humidity_arr, humidities),
        np.append(time_arr, times),
        np.append(wind_dir_arr, wind_dirs),
        np.append(wind_speed_arr, wind_speeds),
        np.append(pressure_arr, pressures),
        np.append(weather_desc_arr, weather_descs),
        np.append(temp_arr, temps),
    )

Read data into DataFrame

In [None]:
# Load the six per-measurement tables with pandas.
# NOTE(review): these paths are relative to the working directory and do not go
# through DATASET_PATH ('datasets/historical-hourly-weather-dataset/') — confirm
# which location is intended.
df_humidity = pd.read_csv("historical-hourly-weather-dataset/humidity.csv")
df_pressure = pd.read_csv("historical-hourly-weather-dataset/pressure.csv")
df_temperature = pd.read_csv("historical-hourly-weather-dataset/temperature.csv")
df_weather_desc = pd.read_csv("historical-hourly-weather-dataset/weather_description.csv")
df_wind_dir = pd.read_csv("historical-hourly-weather-dataset/wind_direction.csv")
df_wind_speed = pd.read_csv("historical-hourly-weather-dataset/wind_speed.csv")

Crawl Ho Chi Minh City data and merge it into the historical data

In [None]:
# Accumulators for the hourly Ho Chi Minh City values returned by the API
humidity_arr = np.array([])
time_arr = np.array([])
wind_dir_arr = np.array([])
wind_speed_arr = np.array([])
pressure_arr = np.array([])
temp_arr = np.array([])
weather_desc_arr = np.array([], dtype = object)
count = 0
# One (first day, last day) pair per month of the default date range
dates_list = get_lst_first_day_last_day()
for date in dates_list:
    # Redraw the progress percentage in place
    clear_output(wait = True)
    print(count / len(dates_list) * 100)
    count += 1
    # One request per month; the default city of the helper is Ho Chi Minh City
    json = call_api_to_get_data(date[0], date[1])
    humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr = handle_append_data(
        json, humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr)

# One two-column frame (datetime + city column) per measurement type
df_hcm_humidity = pd.DataFrame({"datetime": time_arr, "Ho Chi Minh City": humidity_arr})
df_hcm_pressure = pd.DataFrame({"datetime": time_arr, "Ho Chi Minh City": pressure_arr})
df_hcm_wind_dir = pd.DataFrame({"datetime": time_arr, "Ho Chi Minh City": wind_dir_arr})
df_hcm_wind_speed = pd.DataFrame({"datetime": time_arr, "Ho Chi Minh City": wind_speed_arr})
df_hcm_temp= pd.DataFrame({"datetime": time_arr, "Ho Chi Minh City": temp_arr})
df_hcm_weather_desc = pd.DataFrame({"datetime": time_arr, "Ho Chi Minh City": weather_desc_arr})

# Add the new city column to each historical table.
# pd.merge defaults to an inner join, so only timestamps present in both frames survive.
df_humidity = pd.merge(df_humidity, df_hcm_humidity, on = "datetime")
df_pressure = pd.merge(df_pressure, df_hcm_pressure, on = "datetime")
df_wind_dir = pd.merge(df_wind_dir, df_hcm_wind_dir, on = "datetime")
df_wind_speed = pd.merge(df_wind_speed, df_hcm_wind_speed, on = "datetime")
df_temperature = pd.merge(df_temperature, df_hcm_temp, on = "datetime")
df_weather_desc = pd.merge(df_weather_desc, df_hcm_weather_desc, on = "datetime")

Crawl 2023 data (January to October) for every city

In [None]:
# City names come from the city attributes table; Ho Chi Minh City is added by hand
df_country = pd.read_csv("historical-hourly-weather-dataset/city_attributes.csv")

lst_city = list(df_country.City.unique())
lst_city.append("Ho Chi Minh City")

# Combined tables (one column per city) for the newly crawled period
df_new_humidity = pd.DataFrame()
df_new_pressure = pd.DataFrame()
df_new_temperature = pd.DataFrame()
df_new_weather_desc = pd.DataFrame()
df_new_wind_dir = pd.DataFrame()
df_new_wind_speed = pd.DataFrame()

# True until the first city has seeded the combined tables
init = True

for city in lst_city:

    # Month ranges from January 2023 through October 2023
    # (recomputed for every city although the result does not depend on it)
    lst_dates = get_lst_first_day_last_day(datetime(2023, 1, 1), datetime(2023, 10, 1))

    # Fresh accumulators for this city
    humidity_arr = np.array([])
    time_arr = np.array([])
    wind_dir_arr = np.array([])
    wind_speed_arr = np.array([])
    pressure_arr = np.array([])
    temp_arr = np.array([])
    weather_desc_arr = np.array([], dtype = object)
    count = 0
    for date in lst_dates:
        # Progress: city name and percentage of months already requested
        print(city, count/ len(lst_dates) * 100)
        count += 1
        json = call_api_to_get_data(date[0], date[1], city)
        humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr = handle_append_data(
            json, humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr)

    # One two-column frame (datetime + this city's column) per measurement type
    df_city_humidity = pd.DataFrame({"datetime": time_arr, city: humidity_arr})
    df_city_pressure = pd.DataFrame({"datetime": time_arr, city: pressure_arr})
    df_city_wind_dir = pd.DataFrame({"datetime": time_arr, city: wind_dir_arr})
    df_city_wind_speed = pd.DataFrame({"datetime": time_arr, city: wind_speed_arr})
    df_city_temp= pd.DataFrame({"datetime": time_arr, city: temp_arr})
    df_city_weather_desc = pd.DataFrame({"datetime": time_arr , city: weather_desc_arr})
    if init:
        # The first city seeds the combined tables
        df_new_humidity = df_city_humidity.copy()
        df_new_pressure = df_city_pressure.copy()
        df_new_wind_dir = df_city_wind_dir.copy()
        df_new_wind_speed = df_city_wind_speed.copy()
        df_new_temperature = df_city_temp.copy()
        df_new_weather_desc = df_city_weather_desc.copy()
        init = False
    else :
        # Later cities are joined on `datetime` (pd.merge defaults to an inner join)
        df_new_humidity = pd.merge(df_new_humidity, df_city_humidity, on = "datetime")
        df_new_pressure = pd.merge(df_new_pressure, df_city_pressure, on = "datetime")
        df_new_wind_dir = pd.merge(df_new_wind_dir, df_city_wind_dir, on = "datetime")
        df_new_wind_speed = pd.merge(df_new_wind_speed, df_city_wind_speed, on = "datetime")
        df_new_temperature = pd.merge(df_new_temperature, df_city_temp, on = "datetime")
        df_new_weather_desc = pd.merge(df_new_weather_desc, df_city_weather_desc, on = "datetime")

Merge into one DataFrame per attribute

In [None]:
# Stack the newly crawled rows under the historical rows (axis = 0);
# pd.concat aligns the columns of the two frames by name.
df_humidity = pd.concat([df_humidity, df_new_humidity], axis = 0)
df_pressure = pd.concat([df_pressure, df_new_pressure], axis = 0)
df_wind_dir = pd.concat([df_wind_dir, df_new_wind_dir], axis = 0)
df_wind_speed = pd.concat([df_wind_speed, df_new_wind_speed], axis = 0)
df_temperature = pd.concat([df_temperature, df_new_temperature], axis = 0)
df_weather_desc = pd.concat([df_weather_desc, df_new_weather_desc], axis = 0)

Save to .csv files

In [None]:
# Overwrite the CSV files the tables were read from with the expanded tables;
# index = False keeps the pandas row index out of the files.
df_humidity.to_csv("historical-hourly-weather-dataset/humidity.csv", index = False)
df_pressure.to_csv("historical-hourly-weather-dataset/pressure.csv", index = False)
df_wind_dir.to_csv("historical-hourly-weather-dataset/wind_direction.csv", index = False)
df_wind_speed.to_csv("historical-hourly-weather-dataset/wind_speed.csv", index = False)
df_temperature.to_csv("historical-hourly-weather-dataset/temperature.csv", index = False)
df_weather_desc.to_csv("historical-hourly-weather-dataset/weather_description.csv", index = False)

### Data cleaning & integration

Load data

In [None]:
# Load every measurement table, plus the city attributes, as pandas-on-Spark DataFrames
weather_conditions_df = ps.read_csv(f'{DATASET_PATH}weather_description.csv')
humidity_df = ps.read_csv(f'{DATASET_PATH}humidity.csv')
pressure_df = ps.read_csv(f'{DATASET_PATH}pressure.csv')
temperature_df = ps.read_csv(f'{DATASET_PATH}temperature.csv')
city_attributes_df = ps.read_csv(f'{DATASET_PATH}city_attributes.csv')
wind_direction_df = ps.read_csv(f'{DATASET_PATH}wind_direction.csv')
wind_speed_df = ps.read_csv(f'{DATASET_PATH}wind_speed.csv')

In [None]:
# Column names of the unified weather measurements DataFrame built below
DATETIME_COL = 'datetime'
HUMIDITY_COL = 'humidity'
PRESSURE_COL = 'pressure'
TEMPERATURE_COL = 'temperature'
WIND_DIRECTION_COL = 'wind_direction'
WIND_SPEED_COL = 'wind_speed'
LATITUDE_COL = 'latitude'
LONGITUDE_COL = 'longitude'
CITY_COL = 'city'
COUNTRY_COL = 'country'
WEATHER_CONDITION_COL = 'weather_condition'

Create a single `DataFrame` that includes all data from the others

In [None]:
def filter_dataframe_by_city_column(dataframe, city_name, new_column_name):
    """Extract one city's measurements from a wide pandas-on-Spark table.

    Returns a Spark DataFrame with two columns: the datetime column and the
    values of the `city_name` column, copied under `new_column_name`.
    """
    spark_df = dataframe.to_spark()
    with_measurement = spark_df.withColumn(new_column_name, col(city_name))
    return with_measurement.select([DATETIME_COL, new_column_name])
        

def join_dataframes(dataframes: List[DataFrame], column_name: str) -> DataFrame:
    """Join a list of Spark DataFrames on a shared column.

    The first DataFrame is the starting point and every following one is
    joined onto the running result, in list order, on `column_name`.
    """
    merged = dataframes[0]

    for other in dataframes[1:]:
        merged = merged.join(other, on = [column_name])

    return merged

In [None]:
weather_measurements_df = None

# Source table and output column name for each type of measurement
measurement_sources = [
    (humidity_df, HUMIDITY_COL),
    (pressure_df, PRESSURE_COL),
    (temperature_df, TEMPERATURE_COL),
    (wind_direction_df, WIND_DIRECTION_COL),
    (wind_speed_df, WIND_SPEED_COL),
    (weather_conditions_df, WEATHER_CONDITION_COL),
]

# Build one block of rows per record of the cities `DataFrame`
for index, row in city_attributes_df.iterrows():

    city, country = row.City, row.Country
    latitude, longitude = row.Latitude, row.Longitude

    # One two-column `DataFrame` per type of measurement in the city
    dataframes = [
        filter_dataframe_by_city_column(source_df, city, column_name)
        for source_df, column_name in measurement_sources
    ]

    # Join the measurements on the datetime column, then attach the city attributes
    joined_df = join_dataframes(dataframes, DATETIME_COL)
    for attribute_col, attribute_value in (
        (CITY_COL, city),
        (COUNTRY_COL, country),
        (LATITUDE_COL, latitude),
        (LONGITUDE_COL, longitude),
    ):
        joined_df = joined_df.withColumn(attribute_col, lit(attribute_value))

    # Stack this city's rows under the rows of the previous cities
    if weather_measurements_df is None:
        weather_measurements_df = joined_df
    else:
        weather_measurements_df = weather_measurements_df.union(joined_df)

In [None]:
# Number of rows in the unified DataFrame
weather_measurements_df.count()

Remove rows with missing values

In [None]:
# dropna() with its defaults removes every row that has a null in any column
not_null_weather_measurements_df = weather_measurements_df.dropna()

### Label aggregation

In [None]:
def get_weather_conditions_aggregation_dict(weather_conditions):
    """Map raw weather descriptions to one of six aggregated classes.

    Each description is lower-cased and checked against the keyword groups
    below, in order; the first group with a keyword contained in the
    description decides the class. Descriptions that match no keyword are
    left out of the returned dict.
    """
    # Order matters: earlier groups take precedence over later ones
    keyword_groups = (
        ('thunderstorm', ('squall', 'thunderstorm')),
        ('rainy', ('drizzle', 'rain')),
        ('snowy', ('sleet', 'snow')),
        ('cloudy', ('smoke', 'cloud')),
        ('foggy', ('fog', 'mist', 'haze')),
        ('sunny', ('clear', 'sun')),
    )

    aggregated = {}
    for raw_label in weather_conditions:
        lowered = raw_label.lower()
        for target_class, keywords in keyword_groups:
            if any(keyword in lowered for keyword in keywords):
                aggregated[raw_label] = target_class
                break

    return aggregated

In [None]:
# Collect the distinct raw weather descriptions to the driver as a 1-D NumPy array
weather_conditions_all = not_null_weather_measurements_df \
    .select(col(WEATHER_CONDITION_COL)).distinct() \
    .toPandas()[WEATHER_CONDITION_COL].to_numpy().reshape(-1)

In [None]:
# Raw description -> aggregated class; descriptions matching no keyword are not in the dict
weather_conditions_dict = get_weather_conditions_aggregation_dict(weather_conditions_all)

Replace all the weather conditions in the `DataFrame` with the aggregated ones

In [None]:
# Restrict the replacement to the label column: without `subset` the mapping
# is applied to every string column, so an equal value in another column
# (e.g. city or country) would be rewritten as well.
weather_measurements_aggregated_df = not_null_weather_measurements_df.replace(
    weather_conditions_dict,
    subset = [WEATHER_CONDITION_COL])

Remove all samples that contain other conditions

In [None]:
# The aggregated class names produced by the mapping
WEATHER_CONDITIONS = set(weather_conditions_dict.values())

# Keep only rows whose label is one of the aggregated classes; rows whose
# description was not mapped are dropped.
weather_measurements_aggregated_df = weather_measurements_aggregated_df \
    .filter(weather_measurements_aggregated_df[WEATHER_CONDITION_COL].isin(WEATHER_CONDITIONS))

In [None]:
# Sanity check: collect and print the distinct labels left in the
# `weather_condition` column after aggregation and filtering
weather_conditions = weather_measurements_aggregated_df \
    .select(col(WEATHER_CONDITION_COL)).distinct() \
    .toPandas()[WEATHER_CONDITION_COL].to_numpy().reshape(-1)
    
print(weather_conditions)

### Undersampling

In [None]:
def count_weather_condition_occurrences(dataframe: DataFrame, class_name: str) -> int:
    """Return how many rows of `dataframe` are labelled with `class_name`."""
    matches_class = dataframe[WEATHER_CONDITION_COL] == class_name
    return dataframe.where(matches_class).count()

def get_undersampling_fracs(dataframe: DataFrame) -> Dict[str, float]:
    """Compute per-class sampling fractions that balance the weather classes.

    Every class is scaled down to the size of the smallest class that is
    present in `dataframe`, so the smallest class gets fraction 1.0. A class
    with no rows gets fraction 0.0.

    Args:
        dataframe: Spark DataFrame whose weather condition column holds the
            aggregated class names.

    Returns:
        A dict with one sampling fraction per class name.

    Raises:
        ValueError: If none of the six classes occurs in `dataframe`.
    """
    class_names = ('rainy', 'snowy', 'sunny', 'foggy', 'cloudy', 'thunderstorm')

    # A single grouped count replaces one full scan of the data per class
    observed = {
        row[WEATHER_CONDITION_COL]: row['count']
        for row in dataframe.groupBy(WEATHER_CONDITION_COL).count().collect()
    }
    class_counts = {name: observed.get(name, 0) for name in class_names}

    present_counts = [cnt for cnt in class_counts.values() if cnt > 0]
    if not present_counts:
        raise ValueError('None of the expected weather classes is present in the DataFrame')

    # np.min rather than the builtin: `from pyspark.sql.functions import *`
    # in this file rebinds `min` to the Spark aggregate function.
    minority_class_cnt = int(np.min(present_counts))

    return {
        name: (minority_class_cnt / cnt if cnt > 0 else 0.0)
        for name, cnt in class_counts.items()
    }

In [None]:
# Sample from the aggregated DataFrame: the fractions are keyed by the
# aggregated class names ('rainy', 'sunny', ...), which only exist in the
# label column after aggregation, not in the raw descriptions.
sampled_weather_measurements_df = weather_measurements_aggregated_df.sampleBy(
    WEATHER_CONDITION_COL,
    fractions = get_undersampling_fracs(weather_measurements_aggregated_df),
    seed = 42)

Save the undersampled dataset

In [None]:
# Write to AGGREGATED_DATASET_PATH, the folder the "Data statistics" section
# reads the CSV part files back from.
sampled_weather_measurements_df.write.csv(AGGREGATED_DATASET_PATH,
    mode = 'overwrite',
    header = True)

### Data statistics

Load data

In [None]:
# List the CSV part files in the aggregated dataset folder
csv_files = [file for file in os.listdir(AGGREGATED_DATASET_PATH) if file.endswith('.csv')]

# Read each part file into a pandas-on-Spark DataFrame
dfs = [ps.read_csv(os.path.join(AGGREGATED_DATASET_PATH, file)) for file in csv_files]

# Concatenate the parts into a single DataFrame with a fresh index
data = ps.concat(dfs, ignore_index = True)

In [None]:
# Numerical columns summarised in the descriptive statistics below
features = [
    'humidity',
    'pressure',
    'temperature',
    'wind_direction',
    'wind_speed',
    'latitude',
    'longitude',
]

In [None]:
# Number of rows in the undersampled dataset
len(data)

In [None]:
# Preview the first rows
data.head()

Descriptive statistics

In [None]:
# Summary statistics, transposed so that each feature is one row
data[features].describe().transpose()

Boxplot

In [None]:
features = ['humidity', 'pressure', 'temperature']

# One box plot per feature, side by side
fig, axs = plt.subplots(1, 3, figsize = (15, 5))

for ax, feature in zip(axs, features):
    ax.boxplot(data[feature])
    ax.set_ylabel('value')
    ax.set_title(feature, fontweight = "bold")

plt.show()

Distribution

In [None]:
features = ['humidity', 'pressure', 'temperature']
# Pull each feature column to the driver as a plain Python list
f1_list, f2_list, f3_list = (data[name].tolist() for name in features)

# One 2-D histogram per pair of features, side by side
fig, axs = plt.subplots(1, 3, figsize = (15, 5))

panels = [
    (features[0], f1_list, features[1], f2_list),
    (features[0], f1_list, features[2], f3_list),
    (features[1], f2_list, features[2], f3_list),
]

for ax, (x_name, x_values, y_name, y_values) in zip(axs, panels):
    ax.hist2d(x_values, y_values, bins = (50, 50), vmax = 200)
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
    ax.set_title(x_name + ' vs ' + y_name, fontweight = "bold")

plt.show()

# Machine Learning pipeline

### Import libraries

In [None]:
import os
import requests
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

import findspark
findspark.init()
findspark.find() 

import pyspark.pandas as ps
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler, IndexToString
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, DecisionTreeClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [None]:
# Folder holding the per-measurement CSV files of the historical hourly weather dataset
DATASET_PATH = 'datasets/historical-hourly-weather-dataset/'
# Folder holding the CSV part files of the aggregated, sampled dataset loaded below
AGGREGATED_DATASET_PATH = 'datasets/historical-hourly-weather-dataset/aggregated_sampled_weather_measurements'

### Load data

In [None]:
# List the CSV part files in the aggregated dataset folder
csv_files = [file for file in os.listdir(AGGREGATED_DATASET_PATH) if file.endswith('.csv')]

# Read each part file into a pandas-on-Spark DataFrame
dfs = [ps.read_csv(os.path.join(AGGREGATED_DATASET_PATH, file)) for file in csv_files]

# Concatenate the parts into a single DataFrame with a fresh index
data = ps.concat(dfs, ignore_index = True)

### Pre-processing

Select relevant features and label column

In [None]:
# Numerical input features of the models
numerical_cols = [
    'humidity',
    'pressure',
    'temperature',
    'wind_direction',
    'wind_speed',
    'latitude',
    'longitude'
]
# No nominal (categorical) input features are used
nominal_cols = []
# Column holding the true class
label_col = 'weather_condition'
# Column that will hold the predicted class decoded back to a string
prediction_col = 'predicted_weather_condition'
# Keep only the feature columns and the label column
df_selected = data[numerical_cols + nominal_cols + [label_col]]

In [None]:
# Preview the first rows of the selected columns
df_selected.head()

In [None]:
# Number of samples available for training and testing
len(df_selected)

In [None]:
# Get number of samples for each weather condition.
# value_counts() orders the classes by frequency, so the bar names must be
# taken from the result's index; a hard-coded name list would attach the
# counts to the wrong classes.
tmp = df_selected[label_col].value_counts().to_pandas()
values = tmp.tolist()

# Names for each bar, in the same order as `values`
bar_names = tmp.index.astype(str).tolist()

# Bar plot with one bar per weather condition
plt.bar(bar_names, values, color = 'blue')

# Adding labels and title
plt.xlabel('Labels')
plt.ylabel('Number of samples')

plt.savefig('bar_plot.png')

# Display the plot
plt.show()

In [None]:
# Convert the pandas-on-Spark DataFrame to a Spark DataFrame for the ML pipeline
df_selected = df_selected.to_spark()

Train-Test split

In [None]:
# Random 80/20 split; the fixed seed makes the split repeatable
train_data, test_data = df_selected.randomSplit([0.8, 0.2], seed = 42)

Encode

In [None]:
def encode(
    df,
    numerical_cols = None,
    nominal_cols = None,
    label_col = '',
    with_std = True,
    with_mean = True,
):
    """Fit the label-indexing and feature-scaling stages on a DataFrame.

    Args:
        df: Spark DataFrame the stages are fitted on.
        numerical_cols: Names of the numerical feature columns.
        nominal_cols: Names of the nominal feature columns.
        label_col: Name of the string label column.
        with_std: Passed to ``StandardScaler(withStd=...)``.
        with_mean: Passed to ``StandardScaler(withMean=...)``.

    Returns:
        A fitted ``PipelineModel`` that adds the columns ``label``,
        ``raw_features`` and ``scaled_features``.
    """
    # None instead of [] as default avoids sharing one list between calls
    feature_cols = list(numerical_cols or []) + list(nominal_cols or [])

    # Convert categorical label to numerical label
    label_indexer = StringIndexer(
        inputCol = label_col,
        outputCol = 'label',
        handleInvalid = 'keep'
    )

    # Assemble features into a vector
    vector_assembler = VectorAssembler(
        inputCols = feature_cols,
        outputCol = 'raw_features'
    )

    # Scale the features
    scaler = StandardScaler(
        inputCol = 'raw_features',
        outputCol = 'scaled_features',
        withStd = with_std,
        withMean = with_mean
    )

    pipeline = Pipeline(stages = [label_indexer, vector_assembler, scaler])

    return pipeline.fit(df)

In [None]:
# Fit the label indexer and the scaler on the training split only, so that
# no statistics of the test split leak into the preprocessing.
data_encoder = encode(
    df = train_data,
    numerical_cols = numerical_cols,
    nominal_cols = nominal_cols,
    label_col = label_col
)

# Weather Forecasting models

In [None]:
"""
    Support functions for training and evaluating the model
"""

# Evaluate the model
def evaluate(predictions):
    """Print accuracy, weighted precision, weighted recall and F1.

    `predictions` must contain the columns `label` and `prediction`.
    """
    reported_metrics = (
        ('Accuracy:', 'accuracy'),
        ('Precision:', 'weightedPrecision'),
        ('Recall:', 'weightedRecall'),
        ('F1:', 'f1'),
    )

    for heading, metric_name in reported_metrics:
        scorer = MulticlassClassificationEvaluator(
            labelCol = 'label',
            predictionCol = 'prediction',
            metricName = metric_name
        )
        print(heading, scorer.evaluate(predictions))
    
# Confusion matrix
def plot_confusion_matrix(predictions_df, normalize: bool = False, title: str = None): 
    """Plot the confusion matrix of true vs. predicted weather conditions.

    Reads the module-level names `label_col` and `prediction_col` to find the
    true and predicted string labels in `predictions_df`.

    Args:
        predictions_df: Spark DataFrame with both label columns.
        normalize: If True, each row is divided by its sum, so cells show the
            share of each true class.
        title: Plot title; a default is chosen from `normalize` when empty.
    """
    # Collect both columns in ONE job: two separate collects are not
    # guaranteed to return the rows in the same order, which would pair
    # labels with the wrong predictions.
    collected = predictions_df.select(label_col, prediction_col).toPandas()
    labels = collected[label_col].to_numpy()
    predictions = collected[prediction_col].to_numpy()

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Only use the labels that appear in the data
    classes = unique_labels(labels, predictions)

    # Compute confusion matrix; passing `classes` ties rows/columns to the tick labels
    cm = confusion_matrix(labels, predictions, labels = classes)

    if normalize:
        row_sums = cm.sum(axis = 1, keepdims = True)
        # A class that was predicted but never true has an all-zero row;
        # leave it at 0 instead of dividing by zero.
        cm = np.divide(
            cm.astype('float'), row_sums,
            out = np.zeros(cm.shape, dtype = float),
            where = row_sums != 0
        )

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap = plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]), yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,           
           xlabel='Predicted label', ylabel='True label',
           title=title)

    # Set the y limits explicitly so the first and last rows are drawn in full
    ax.set_ylim(len(classes) - 0.5, -0.5)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation = 45, ha = 'right', rotation_mode = 'anchor')

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j, i, format(cm[i, j], fmt), ha = 'center', va = 'center',
                color = 'white' if cm[i, j] > thresh else 'black'
            )

    fig.tight_layout()
    plt.show()
    
# Decode the numeric `prediction` column back to the string labels learned by
# the label indexer (first stage of the fitted encoder)
idx_to_str = IndexToString(
    inputCol = 'prediction',
    outputCol = prediction_col,
    labels = data_encoder.stages[0].labels
)

### Random Forest

Define the classifier

In [None]:
# Spark ML tree learners only accept maxDepth in the range [0, 30]; a larger
# value is rejected when the model is fitted. The seed makes the bootstrap
# and feature sampling repeatable, like the seeded train/test split.
classifier = RandomForestClassifier(
    featuresCol = 'scaled_features',
    labelCol = 'label',
    numTrees = 10,
    maxDepth = 30,
    seed = 42,
)

Define the pipeline with the encoding and classifier stages

In [None]:
# The already fitted encoder is applied as a transformer, then the classifier is trained
pipeline = Pipeline(stages = [data_encoder, classifier])

Define the evaluator

In [None]:
# Accuracy is the metric the cross validator uses to score each fold
evaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy'
)

Define hyperparameter tuning (optional)

In [None]:
# Define the grid of hyperparameters (nothing is added to the grid, so only
# the classifier's configured parameters are evaluated)
param_grid = ParamGridBuilder().build()

# Set up the 5-fold cross validator for model training and hyperparameter tuning
cross_validator = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = param_grid,
    evaluator = evaluator,
    numFolds = 5
)

Fit the model using the training data

In [None]:
# Run cross-validation on the training split
model = cross_validator.fit(train_data)

Predict and evaluate on test data

In [None]:
# Apply the fitted model to the held-out test split
predictions = model.transform(test_data)

In [None]:
# Print accuracy, precision, recall and F1 on the test split
evaluate(predictions)

In [None]:
# Add the string-valued prediction column needed by the confusion matrix
predictions = idx_to_str.transform(predictions)

Confusion matrix

In [None]:
# Row-normalized confusion matrix on the test split
plot_confusion_matrix(predictions, normalize = True)

### Logistic Regression

Define the classifier

In [None]:
# Logistic regression without regularization (regParam = 0.0) and with up to 1000 iterations
classifier = LogisticRegression(
    featuresCol = 'scaled_features',
    labelCol = 'label',
    maxIter = 1000,
    regParam = 0.0,
    elasticNetParam = 0.0
)

Define the pipeline with the encoding and classifier stages

In [None]:
# The already fitted encoder is applied as a transformer, then the classifier is trained
pipeline = Pipeline(stages = [data_encoder, classifier])

Define the evaluator

In [None]:
# Accuracy is the metric the cross validator uses to score each fold
evaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy'
)

Define hyperparameter tuning (optional)

In [None]:
# Define the grid of hyperparameters (nothing is added to the grid, so only
# the classifier's configured parameters are evaluated)
param_grid = ParamGridBuilder().build()

# Set up the 5-fold cross validator for model training and hyperparameter tuning
cross_validator = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = param_grid,
    evaluator = evaluator,
    numFolds = 5
)

Fit the model using the training data

In [None]:
# Run cross-validation on the training split
model = cross_validator.fit(train_data)

Predict and evaluate on test data

In [None]:
# Apply the fitted model to the held-out test split
predictions = model.transform(test_data)

In [None]:
# Print accuracy, precision, recall and F1 on the test split
evaluate(predictions)

In [None]:
# Add the string-valued prediction column needed by the confusion matrix
predictions = idx_to_str.transform(predictions)

Confusion matrix

In [None]:
# Row-normalized confusion matrix on the test split
plot_confusion_matrix(predictions, normalize = True)

### Decision Tree

Define the classifier

In [None]:
# Spark ML tree learners only accept maxDepth in the range [0, 30]; a larger
# value is rejected when the model is fitted. The seed keeps runs repeatable,
# like the seeded train/test split.
classifier = DecisionTreeClassifier(
    featuresCol = 'scaled_features',
    labelCol = 'label',
    maxDepth = 30,
    seed = 42
)

Define the pipeline with the encoding and classifier stages

In [None]:
# The already fitted encoder is applied as a transformer, then the classifier is trained
pipeline = Pipeline(stages = [data_encoder, classifier])

Define the evaluator

In [None]:
# Accuracy is the metric the cross validator uses to score each fold
evaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy'
)

Define hyperparameter tuning (optional)

In [None]:
# Define the grid of hyperparameters (nothing is added to the grid, so only
# the classifier's configured parameters are evaluated)
param_grid = ParamGridBuilder().build()

# Set up the 5-fold cross validator for model training and hyperparameter tuning
cross_validator = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = param_grid,
    evaluator = evaluator,
    numFolds = 5
)

Fit the model using the training data

In [None]:
# Run cross-validation on the training split
model = cross_validator.fit(train_data)

Predict and evaluate on test data

In [None]:
# Apply the fitted model to the held-out test split
predictions = model.transform(test_data)

In [None]:
# Print accuracy, precision, recall and F1 on the test split
evaluate(predictions)

In [None]:
# Add the string-valued prediction column needed by the confusion matrix
predictions = idx_to_str.transform(predictions)

Confusion matrix

In [None]:
# Row-normalized confusion matrix on the test split
plot_confusion_matrix(predictions, normalize = True)

### Multilayer Perceptron

Define the layers of the neural network

In [None]:
# Layer sizes: one input unit per feature, hidden layers of 32, 64 and 128
# units, and 6 output units (one per aggregated weather class)
layers = [len(numerical_cols) + len(nominal_cols), 32, 64, 128, 6]

Define the classifier

In [None]:
# Multilayer perceptron with the layer sizes defined above; the seed fixes the weight initialisation
classifier = MultilayerPerceptronClassifier(
    featuresCol = 'scaled_features',
    labelCol = 'label',
    maxIter = 500,
    layers = layers,
    blockSize = 128,
    seed = 42,
)

Define the pipeline with the encoding and classifier stages

In [None]:
# The already fitted encoder is applied as a transformer, then the classifier is trained
pipeline = Pipeline(stages = [data_encoder, classifier])

Define the evaluator

In [None]:
# Accuracy is the metric the cross validator uses to score each fold
evaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy'
)

Define hyperparameter tuning (optional)

In [None]:
# Define the grid of hyperparameters (nothing is added to the grid, so only
# the classifier's configured parameters are evaluated)
param_grid = ParamGridBuilder().build()

# Set up the 5-fold cross validator for model training and hyperparameter tuning
cross_validator = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = param_grid,
    evaluator = evaluator,
    numFolds = 5
)

Fit the model using the training data

In [None]:
# Run cross-validation on the training split
model = cross_validator.fit(train_data)

Predict and evaluate on test data

In [None]:
# Apply the fitted model to the held-out test split
predictions = model.transform(test_data)

In [None]:
# Print accuracy, precision, recall and F1 on the test split
evaluate(predictions)

In [None]:
# Add the string-valued prediction column needed by the confusion matrix
predictions = idx_to_str.transform(predictions)

Confusion matrix

In [None]:
# Row-normalized confusion matrix on the test split
plot_confusion_matrix(predictions, normalize = True)

# Comparison with [World Weather Online](https://www.worldweatheronline.com/) forecasts

In [None]:
def handle_description(desc: str):
    """Map a free-text weather description to an aggregated class name.

    The description is lower-cased and checked against the keyword groups
    below, in order; the first group with a keyword contained in the
    description decides the class. A description that matches no keyword is
    returned unchanged.
    """
    # Order matters: earlier groups take precedence over later ones
    keyword_groups = (
        ('thunderstorm', ('squall', 'thunderstorm')),
        ('rainy', ('drizzle', 'rain', "rainy")),
        ('snowy', ('sleet', 'snow', "snowy")),
        ('cloudy', ('cloud', "overcast", "cloudy", "clouds")),
        ('foggy', ('fog', 'mist', 'haze', "smoke", "dust", "foggy")),
        ('sunny', ('clear', 'sun', "sunny")),
    )

    lowered = desc.lower()
    for target_class, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in lowered:
                return target_class

    return desc

def handle_append_data(json, humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr):
    """Append the hourly records of an API response to the accumulator arrays.

    Args:
        json: Decoded API response; hourly records are read from
            ``json["data"]["weather"][i]["hourly"]``.
        humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr,
        weather_desc_arr, temp_arr: NumPy arrays holding the values collected
            so far. They are not modified in place.

    Returns:
        A tuple ``(humidity_arr, time_arr, wind_dir_arr, wind_speed_arr,
        pressure_arr, weather_desc_arr, temp_arr)`` with the new values
        appended. Wind speed is converted from km/h to m/s, temperature from
        Celsius to Kelvin, and each weather description is mapped through
        ``handle_description``.
    """
    # Call the builtin explicitly so the rounding cannot be hijacked by a
    # wildcard import that rebinds `round` (pyspark.sql.functions exports one).
    import builtins

    times, humidities, wind_dirs, wind_speeds = [], [], [], []
    pressures, temps, weather_descs = [], [], []

    for day_data in json["data"]["weather"]:

        date = day_data["date"]
        for hour_data in day_data['hourly']:
            # The API encodes the hour as "0", "100", ..., "2300"
            hour = int(hour_data["time"]) // 100
            # Zero-pad the hour so the timestamp is a regular "YYYY-MM-DD HH:MM:SS" string
            times.append("{date} {hour:02d}:00:00".format(date = date, hour = hour))
            wind_speeds.append(builtins.round(float(hour_data["windspeedKmph"]) * (10/36), 2))
            wind_dirs.append(int(hour_data["winddirDegree"]))
            humidities.append(int(hour_data["humidity"]))
            pressures.append(int(hour_data["pressure"]))
            temps.append(int(hour_data["tempC"]) + 273.15)
            weather_descs.append(handle_description(hour_data["weatherDesc"][0]["value"]))

    if not times:
        # Nothing to append: hand the accumulators back untouched
        return humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr

    # One append per array instead of one per record avoids re-copying the
    # whole accumulator for every hour.
    return (
        np.append(humidity_arr, humidities),
        np.append(time_arr, times),
        np.append(wind_dir_arr, wind_dirs),
        np.append(wind_speed_arr, wind_speeds),
        np.append(pressure_arr, pressures),
        np.append(weather_desc_arr, weather_descs),
        np.append(temp_arr, temps),
    )

def get_7_days_forecasts_df(lng = 10.762622,lat = 106.660172, city_name = "Ho Chi Minh City", api_key = 'c360b29aaa5a41a894b130731230511'):
    """Fetch a World Weather Online forecast as a pandas-on-Spark DataFrame.

    NOTE: the first two parameter names are swapped and are kept only for
    backward compatibility: ``lng`` carries the latitude and ``lat`` the
    longitude. The defaults (10.762622, 106.660172) are the latitude and
    longitude of Ho Chi Minh City in that order, and the request's ``q``
    value is built as ``{lng},{lat}``.

    SECURITY(review): the default ``api_key`` is a credential committed to
    the source; it should be rotated and supplied by the caller instead.

    Args:
        lng: Latitude of the location (see the note above).
        lat: Longitude of the location (see the note above).
        city_name: Value stored in the ``city`` column.
        api_key: World Weather Online API key.

    Returns:
        A DataFrame with the columns datetime, humidity, temperature,
        pressure, wind_direction, wind_speed, weather_condition, city,
        latitude and longitude.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within 30 seconds.
    """
    # HTTPS keeps the API key out of clear-text traffic
    link = "https://api.worldweatheronline.com/premium/v1/marine.ashx?q={lng},{lat}&key={api_key}&format=json&tp=3".format(
        lng = lng,
        lat = lat,
        api_key = api_key,
    )
    response = requests.get(link, timeout = 30)
    # Surface HTTP errors here rather than as a KeyError while parsing
    response.raise_for_status()
    res = response.json()

    humidity_arr = np.array([])
    time_arr = np.array([])
    wind_dir_arr = np.array([])
    wind_speed_arr = np.array([])
    pressure_arr = np.array([])
    temp_arr = np.array([])
    weather_desc_arr = np.array([], dtype = object)

    humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr = handle_append_data(
            res, humidity_arr, time_arr, wind_dir_arr, wind_speed_arr, pressure_arr, weather_desc_arr , temp_arr)

    df = ps.DataFrame({
        'datetime': time_arr,
        'humidity': humidity_arr,
        'temperature': temp_arr,
        'pressure': pressure_arr,
        'wind_direction': wind_dir_arr,
        'wind_speed': wind_speed_arr,
        'weather_condition': weather_desc_arr,
        'city': city_name,
        # `lng` holds the latitude and `lat` the longitude (see docstring);
        # storing them the other way round fed swapped coordinates to the model.
        'latitude': float(lng),
        'longitude': float(lat),
    })
    return df

In [None]:
# Vancouver lies west of the prime meridian, so its longitude is negative
worldweather_forecast = get_7_days_forecasts_df(49.24966, -123.119339, "Vancouver")

In [None]:
# Keep the same feature and label columns the models were trained on
df_test = worldweather_forecast[numerical_cols + nominal_cols + [label_col]]

In [None]:
# Convert the forecast frame itself: `df_selected` is the training data and
# has already been converted to a Spark DataFrame earlier in this notebook.
df_test = df_test.to_spark()

In [None]:
# Predict with the most recently fitted model.
# NOTE: despite the variable name, the forecasts come from World Weather Online.
openweather_forecasts_predictions = model.transform(df_test)

In [None]:
# Add the string-valued prediction column needed by the confusion matrix
predictions = idx_to_str.transform(openweather_forecasts_predictions)

In [None]:
# Print accuracy, precision, recall and F1 against the forecast labels
evaluate(predictions)

In [None]:
# Row-normalized confusion matrix of model predictions vs. forecast labels
plot_confusion_matrix(predictions, normalize = True)