# Weather forecasting with PySpark
## Big Data Computing final project

### Define some global constants

In [None]:
# DBFS folders holding the dataset files and the persisted pipelines/models
DATASET_PATH: str = 'dbfs:/bigdata_proj/datasets/historical-hourly-weather/'
MODELS_PATH: str = 'dbfs:/bigdata_proj/models/historical-hourly-weather/'

# Seed passed to the stochastic Spark operations (stratified sampling, train/test split)
RANDOM_SEED: int = 42

# True to also run the expensive inspection cells guarded by this flag
SLOW_OPERATIONS: bool = True

# True to save the results of dataset preprocessing, fitted pipelines and trained models to the filesystem
SAVE_COMPUTATIONS: bool = True

# True to load the sampled dataset from the filesystem, False to compute it from the raw one
LOAD_SAMPLED_DATASET: bool = True
SAMPLED_DATASET_PATH: str = f'{DATASET_PATH}aggregated_sampled_weather_measurements.csv'

# True to load the encoding pipeline from the filesystem, False to compute it from scratch
# NOTE(review): the identifier is misspelled ("ECONDING"); it is left unchanged here because
# other cells may reference it by this exact name
LOAD_ECONDING_PIPELINE: bool = True
ENCODING_PIPELINE_PATH: str = f'{MODELS_PATH}data_encoder'

# True to load pretrained models from the filesystem, False to compute them from scratch
LOAD_PRETRAINED_MODELS: bool = True
RANDOM_FOREST_MODEL_PATH: str = f'{MODELS_PATH}rnd_forest'
RANDOM_FOREST_CROSS_VALIDATION_MODEL_PATH: str = f'{MODELS_PATH}rnd_forest_cv'
LOGISTIC_REGRESSION_CROSS_VALIDATION_MODEL_PATH: str = f'{MODELS_PATH}log_reg_cv'

# necessary due to DataBricks community edition limits (training on a dataframe larger than this threshold causes an Internal Server Error)
MAX_TRAIN_SIZE: int = 999_999

### Import PySpark packages and other dependencies

In [None]:
import pyspark
import numpy as np
import matplotlib.pyplot as plt

from pyspark import SparkContext, SparkConf
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.tuning import CrossValidatorModel

from typing import *

## Dataset initialization

### Download the dataset
Original source: [kaggle.com/selfishgene/historical-hourly-weather-data](https://www.kaggle.com/selfishgene/historical-hourly-weather-data)

In [None]:
%sh
# Download the zipped dataset from the project repository to the driver's local /tmp
wget --no-verbose https://github.com/andrea-gasparini/big-data-weather-forecasting/raw/master/dataset/historical-hourly-weather-dataset.zip -O /tmp/dataset.zip
# Extract into /tmp/dataset; -u only writes files that are new or newer than the existing ones
unzip -u /tmp/dataset.zip -d /tmp/dataset

2024-03-16 17:05:24 URL:https://raw.githubusercontent.com/andrea-gasparini/big-data-weather-forecasting/master/dataset/historical-hourly-weather-dataset.zip [12655281/12655281] -> "/tmp/dataset.zip" [1]


Archive:  /tmp/dataset.zip
   creating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/._committed_7616641238230246128.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/.part-00000-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413e3b2b1a-5515-1-c000.csv.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/.part-00006-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413e3b2b1a-5521-1-c000.csv.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/.part-00002-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413e3b2b1a-5517-1-c000.csv.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/.part-00004-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413e3b2b1a-5519-1-c000.csv.crc  
  inflating: /tmp/dataset/aggregated_sampled_weather_measurements.csv/part-00003-tid-5785058191842647654-99694b27-5637-4d82-97fd-79413

### Move the dataset from Databricks local driver node's file system to DBFS

In [None]:
# Move every extracted entry from the driver's local /tmp/dataset into the DBFS dataset folder;
# recurse=True because some entries are directories (e.g. the partitioned sampled CSV)
for file in dbutils.fs.ls('file:/tmp/dataset'):
    dbutils.fs.mv(file.path, f'{DATASET_PATH}{file.name}', recurse=True)

In [None]:
%fs ls /bigdata_proj/datasets/historical-hourly-weather

path,name,size,modificationTime
dbfs:/bigdata_proj/datasets/historical-hourly-weather/aggregated_sampled_weather_measurements.csv/,aggregated_sampled_weather_measurements.csv/,0,1710607244000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/city_attributes.csv,city_attributes.csv,1614,1710608726000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/humidity.csv,humidity.csv,9075077,1710608725000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/pressure.csv,pressure.csv,12155911,1710608726000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/temperature.csv,temperature.csv,13971171,1710608726000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/weather_description.csv,weather_description.csv,21858089,1710608728000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/wind_direction.csv,wind_direction.csv,10171003,1710608726000
dbfs:/bigdata_proj/datasets/historical-hourly-weather/wind_speed.csv,wind_speed.csv,7457531,1710608728000


### Load dataset into Spark DataFrame objects

### Dataset shape and schema
The raw dataset downloaded from kaggle is composed of 7 different `.csv` files:
- `city_attributes.csv` contains geographical information about the different cities for which there are weather measurements
- `weather_description.csv` contains the textual description of the weather conditions, where each column refers to a different city and each row refers to a specific `datetime` in which the weather condition occurred
- Each one of the other 5 csv follows the same structure as `weather_description.csv` and contains the measurements of the following metrics: `humidity`,  `pressure`, `temperature`, `wind_direction`, `wind_speed`

Except for `city_attributes.csv`, each of the other files contains about **45.000** records of hourly weather measurements which, multiplied by the **36** cities, results in approximately **1.500.000** records.

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it
spark = SparkSession.builder.getOrCreate()

# Load each raw CSV, using the first row as column names and letting Spark infer the column types
weather_conditions_df = spark.read.csv(f'{DATASET_PATH}weather_description.csv', header=True, inferSchema=True)
humidity_df = spark.read.csv(f'{DATASET_PATH}humidity.csv', header=True, inferSchema=True)
pressure_df = spark.read.csv(f'{DATASET_PATH}pressure.csv', header=True, inferSchema=True)
temperature_df = spark.read.csv(f'{DATASET_PATH}temperature.csv', header=True, inferSchema=True)
city_attributes_df = spark.read.csv(f'{DATASET_PATH}city_attributes.csv', header=True, inferSchema=True)
wind_direction_df = spark.read.csv(f'{DATASET_PATH}wind_direction.csv', header=True, inferSchema=True)
wind_speed_df = spark.read.csv(f'{DATASET_PATH}wind_speed.csv', header=True, inferSchema=True)

**Define a helper that prints the shape of a DataFrame, to get a first look at the data**

In [None]:
def print_dataframe_shape(dataframe: DataFrame) -> None:
    '''
    Args:
        - dataframe: the `DataFrame` whose size should be reported

    Prints the number of rows and columns of `dataframe`, followed by an empty line
    '''
    columns_count = len(dataframe.columns)
    rows_count = dataframe.count()
    shape_message = f'The shape of the dataset is {rows_count} rows by {columns_count} columns'
    print(shape_message, end='\n\n')

**Using the function above on `city_attributes.csv`, and additionally printing its schema**

In [None]:
print_dataframe_shape(city_attributes_df)

The shape of the dataset is 36 rows by 4 columns



In [None]:
city_attributes_df.printSchema()

root
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)



In [None]:
city_attributes_df.head(5)

[Row(City='Vancouver', Country='Canada', Latitude=49.24966, Longitude=-123.119339),
 Row(City='Portland', Country='United States', Latitude=45.523449, Longitude=-122.676208),
 Row(City='San Francisco', Country='United States', Latitude=37.774929, Longitude=-122.419418),
 Row(City='Seattle', Country='United States', Latitude=47.606209, Longitude=-122.332069),
 Row(City='Los Angeles', Country='United States', Latitude=34.052231, Longitude=-118.243683)]

**Using the `print_dataframe_shape` function defined above on `weather_description.csv`, and additionally printing its schema.**

In [None]:
print_dataframe_shape(weather_conditions_df)

The shape of the dataset is 45253 rows by 37 columns



In [None]:
weather_conditions_df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- Vancouver: string (nullable = true)
 |-- Portland: string (nullable = true)
 |-- San Francisco: string (nullable = true)
 |-- Seattle: string (nullable = true)
 |-- Los Angeles: string (nullable = true)
 |-- San Diego: string (nullable = true)
 |-- Las Vegas: string (nullable = true)
 |-- Phoenix: string (nullable = true)
 |-- Albuquerque: string (nullable = true)
 |-- Denver: string (nullable = true)
 |-- San Antonio: string (nullable = true)
 |-- Dallas: string (nullable = true)
 |-- Houston: string (nullable = true)
 |-- Kansas City: string (nullable = true)
 |-- Minneapolis: string (nullable = true)
 |-- Saint Louis: string (nullable = true)
 |-- Chicago: string (nullable = true)
 |-- Nashville: string (nullable = true)
 |-- Indianapolis: string (nullable = true)
 |-- Atlanta: string (nullable = true)
 |-- Detroit: string (nullable = true)
 |-- Jacksonville: string (nullable = true)
 |-- Charlotte: string (nullable = true)
 |-- M

In [None]:
weather_conditions_df[weather_conditions_df.columns[:5]].head(5)

[Row(datetime=datetime.datetime(2012, 10, 1, 12, 0), Vancouver=None, Portland=None, San Francisco=None, Seattle=None),
 Row(datetime=datetime.datetime(2012, 10, 1, 13, 0), Vancouver='mist', Portland='scattered clouds', San Francisco='light rain', Seattle='sky is clear'),
 Row(datetime=datetime.datetime(2012, 10, 1, 14, 0), Vancouver='broken clouds', Portland='scattered clouds', San Francisco='sky is clear', Seattle='sky is clear'),
 Row(datetime=datetime.datetime(2012, 10, 1, 15, 0), Vancouver='broken clouds', Portland='scattered clouds', San Francisco='sky is clear', Seattle='sky is clear'),
 Row(datetime=datetime.datetime(2012, 10, 1, 16, 0), Vancouver='broken clouds', Portland='scattered clouds', San Francisco='sky is clear', Seattle='sky is clear')]

# Dataset preprocessing Starts

### For Columns
Define aliases for the column names, so that they are used consistently throughout the rest of the code

In [None]:
# Timestamp of each hourly measurement (also the key the per-metric tables are joined on)
DATETIME_COL = 'datetime'
# One column per measured metric
HUMIDITY_COL = 'humidity'
PRESSURE_COL = 'pressure'
TEMPERATURE_COL = 'temperature'
WIND_DIRECTION_COL = 'wind_direction'
WIND_SPEED_COL = 'wind_speed'
# City attributes attached to every measurement
LATITUDE_COL = 'latitude'
LONGITUDE_COL = 'longitude'
CITY_COL = 'city'
COUNTRY_COL = 'country'
# Textual weather condition of the measurement
WEATHER_CONDITION_COL = 'weather_condition'

**Create a single DataFrame that includes all data from the others**

Reason to do this: as we have seen from the shapes above, the dataset needs to be reshaped to be suitable for Machine Learning purposes. The best solution here is to use a single DataFrame object which includes all the information: one column for each metric, some columns for information such as the city and its geographical position, and lastly one column for the weather condition.

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

def filter_dataframe_by_city_column(dataframe: DataFrame,
                                    city_name: str,
                                    new_column_name: str) -> DataFrame:
    '''
    Extracts the measurements of a single city from a table with one column per city.

    Args:
        - dataframe: a `DataFrame` with a datetime column and n cities columns,
                     whose records are the related hourly measurements
        - city_name: name of one of the city columns of `dataframe`
        - new_column_name: name given to the extracted city column

    Returns:
        a new `DataFrame` made of exactly two columns:
            - the datetime column
            - the measurements of `city_name`, exposed as `new_column_name`
    '''
    with_city_measurements = dataframe.withColumn(new_column_name, col(city_name))
    kept_columns = [DATETIME_COL, new_column_name]
    return with_city_measurements.select(kept_columns)

**performing the join operation**

In [None]:
def join_dataframes(dataframes: List[DataFrame], column_name: str) -> DataFrame:
    '''
    Joins a list of dataframes, left to right, over a shared column.

    Args:
        - dataframes: a list of `DataFrame` to be joined (at least one element)
        - column_name: the column over which the records should be joined

    Returns:
        a new `DataFrame` resulting from the join of all the dataframes
        over the `column_name` column; with a single input, that input itself
    '''
    first, remaining = dataframes[0], dataframes[1:]

    merged_df = first
    for right_df in remaining:
        merged_df = merged_df.join(right_df, on=[column_name])

    return merged_df

**Combine Weather Measurements with City Attribute**

This code segment iterates over each city in a DataFrame (city_attributes_df), filters several DataFrames containing weather measurements based on the city, and then joins them together while adding city attributes as columns. Finally, it aggregates these DataFrames into a main DataFrame (weather_measurements_df)

In [None]:
from pyspark.sql import Row  # NOTE(review): `Row` is not referenced anywhere in this cell

# Main DataFrame accumulating the per-city measurements (None until the first city is processed)
weather_measurements_df = None

# Bring the city attributes to the driver as a list of rows (one row per city)
city_attributes_list = city_attributes_df.collect()

# Iterate over each city and its attributes
for row in city_attributes_list:
    # Extract attributes for the current city
    city = row.City
    country = row.Country
    latitude = row.Latitude
    longitude = row.Longitude

    # Turn each "one column per city" table into a two-column (datetime, metric) table for this city
    dataframes = [
        filter_dataframe_by_city_column(humidity_df, city, HUMIDITY_COL),
        filter_dataframe_by_city_column(pressure_df, city, PRESSURE_COL),
        filter_dataframe_by_city_column(temperature_df, city, TEMPERATURE_COL),
        filter_dataframe_by_city_column(wind_direction_df, city, WIND_DIRECTION_COL),
        filter_dataframe_by_city_column(wind_speed_df, city, WIND_SPEED_COL),
        filter_dataframe_by_city_column(weather_conditions_df, city, WEATHER_CONDITION_COL)
    ]

    # Join the per-metric tables on the datetime column and attach the city attributes as constant columns.
    # NOTE(review): no join type is passed, so Spark's default (inner) applies - confirm that dropping
    # datetimes missing from any of the tables is intended
    joined_df = join_dataframes(dataframes, DATETIME_COL) \
        .withColumn(CITY_COL, lit(city)) \
        .withColumn(COUNTRY_COL, lit(country)) \
        .withColumn(LATITUDE_COL, lit(latitude)) \
        .withColumn(LONGITUDE_COL, lit(longitude))

    # Append this city's rows to the main DataFrame (the first city initializes it).
    # union() matches columns by position; every per-city DataFrame is built above with the same column order
    weather_measurements_df = weather_measurements_df.union(joined_df) if weather_measurements_df is not None else joined_df


In [None]:
# Print the shape (number of rows and columns) of the weather_measurements_df DataFrame
print_dataframe_shape(weather_measurements_df)

The shape of the dataset is 1629108 rows by 11 columns



In [None]:
# Print the schema of the weather_measurements_df DataFrame
weather_measurements_df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- humidity: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- wind_direction: double (nullable = true)
 |-- wind_speed: double (nullable = true)
 |-- weather_condition: string (nullable = true)
 |-- city: string (nullable = false)
 |-- country: string (nullable = false)
 |-- latitude: double (nullable = false)
 |-- longitude: double (nullable = false)



In [None]:
# Check if SLOW_OPERATIONS flag is set to True before displaying the first 5 rows of the DataFrame
if SLOW_OPERATIONS:
    weather_measurements_df.show(5)

+-------------------+--------+--------+-------------+--------------+----------+-----------------+---------+-------+--------+-----------+
|           datetime|humidity|pressure|  temperature|wind_direction|wind_speed|weather_condition|     city|country|latitude|  longitude|
+-------------------+--------+--------+-------------+--------------+----------+-----------------+---------+-------+--------+-----------+
|2012-10-01 12:00:00|    NULL|    NULL|         NULL|          NULL|      NULL|             NULL|Vancouver| Canada|49.24966|-123.119339|
|2012-10-01 13:00:00|    76.0|    NULL|       284.63|           0.0|       0.0|             mist|Vancouver| Canada|49.24966|-123.119339|
|2012-10-01 14:00:00|    76.0|    NULL| 284.62904131|           6.0|       0.0|    broken clouds|Vancouver| Canada|49.24966|-123.119339|
|2012-10-01 15:00:00|    76.0|    NULL|284.626997923|          20.0|       0.0|    broken clouds|Vancouver| Canada|49.24966|-123.119339|
|2012-10-01 16:00:00|    77.0|    NULL|28

In [None]:
# Summary statistics of every column, transposed so that each column becomes a row (slow: guarded by SLOW_OPERATIONS)
if SLOW_OPERATIONS:
    # NOTE(review): the result is neither assigned nor explicitly displayed; since it sits inside
    # the `if`, it is presumably not rendered as the cell output - confirm
    weather_measurements_df.describe().to_koalas().transpose()

In [None]:
# Count the missing values of each column (slow: guarded by SLOW_OPERATIONS).
# All the counts are computed in a single aggregation, instead of triggering one Spark job
# per column over the whole join/union lineage.
if SLOW_OPERATIONS:
    # when(...) yields a non-null value only for null cells, and count(...) counts non-null values,
    # so each aggregated column holds the number of nulls of the homonymous original column
    null_counts = weather_measurements_df.select(
        [count(when(col(c).isNull(), c)).alias(c) for c in weather_measurements_df.columns]
    ).first()

    for c in weather_measurements_df.columns:
        print(f'Missing values of column `{c}` count: {null_counts[c]}')


Missing values of column `datetime` count: 0
Missing values of column `humidity` count: 28651
Missing values of column `pressure` count: 16680
Missing values of column `temperature` count: 8030
Missing values of column `wind_direction` count: 7975
Missing values of column `wind_speed` count: 7993
Missing values of column `weather_condition` count: 7955
Missing values of column `city` count: 0
Missing values of column `country` count: 0
Missing values of column `latitude` count: 0
Missing values of column `longitude` count: 0


In [None]:
# Create a new DataFrame excluding rows with null values
# (dropna() is called with its defaults, i.e. a row is dropped as soon as any of its columns is null)
not_null_weather_measurements_df = weather_measurements_df.dropna()

In [None]:
# Check if SLOW_OPERATIONS flag is set to True before grouping by weather conditions and counting occurrences
if SLOW_OPERATIONS:
    not_null_weather_measurements_df.groupBy(WEATHER_CONDITION_COL).count().show(truncate=False)

     

+----------------------------+------+
|weather_condition           |count |
+----------------------------+------+
|fog                         |16185 |
|very heavy rain             |1001  |
|proximity shower rain       |2339  |
|few clouds                  |133685|
|heavy shower snow           |336   |
|light rain                  |127364|
|light intensity drizzle     |8048  |
|light intensity shower rain |3633  |
|broken clouds               |167102|
|overcast clouds             |133778|
|light snow                  |14368 |
|scattered clouds            |143277|
|thunderstorm with heavy rain|396   |
|thunderstorm with light rain|1179  |
|heavy intensity rain        |14075 |
|moderate rain               |43172 |
|light intensity drizzle rain|41    |
|sky is clear                |641577|
|snow                        |3156  |
|light shower snow           |998   |
+----------------------------+------+
only showing top 20 rows



**Categorizing Weather Conditions**

This function categorizes a collection of string weather conditions into broader categories such as thunderstorm, rainy, snowy, cloudy, foggy, or sunny. It iterates through each weather condition, converting them to lowercase for case-insensitive matching. Based on keywords present in each condition, it assigns them to one of the predefined categories. Finally, it returns a dictionary mapping each original weather condition to its corresponding category.

In [None]:
def get_weather_conditions_aggregation_dict(weather_conditions: Iterable[str]) -> Dict[str, str]:
    '''
    Groups fine-grained weather condition names into a few broad categories.

    Args:
        - weather_conditions: an iterable collection of string weather conditions to be aggregated

    Returns:
        a dictionary that maps from the original weather condition name to one of the following categories:
            - thunderstorm
            - rainy
            - snowy
            - cloudy
            - foggy
            - sunny
        Conditions that contain none of the known keywords are not included in the dictionary.
    '''

    # Ordered (category, keywords) rules, matched case-insensitively as substrings.
    # The first matching rule wins, e.g. 'thunderstorm with heavy rain' is a thunderstorm, not rainy.
    category_rules = (
        ('thunderstorm', ('squall', 'thunderstorm')),
        ('rainy', ('drizzle', 'rain')),
        ('snowy', ('sleet', 'snow')),
        ('cloudy', ('cloud',)),
        ('foggy', ('fog', 'mist', 'haze')),
        ('sunny', ('clear', 'sun')),
    )

    aggregated_conditions = {}

    for original_name in weather_conditions:
        lowered_name = original_name.lower()

        for category, keywords in category_rules:
            if any(keyword in lowered_name for keyword in keywords):
                # Keys keep the original (non-lowered) spelling of the condition
                aggregated_conditions[original_name] = category
                break

    return aggregated_conditions

In [None]:
# Collect the distinct weather condition values of the null-free dataset
# and flatten them into a one-dimensional array
weather_conditions_all = not_null_weather_measurements_df \
    .select(WEATHER_CONDITION_COL).distinct() \
    .to_koalas().to_numpy().reshape(-1)

In [None]:
# Map every distinct weather condition to its aggregated category (unmatched conditions are left out)
weather_conditions_dict = get_weather_conditions_aggregation_dict(weather_conditions_all)

In [None]:
# Replace each original weather condition with its aggregated category.
# The replacement is restricted to the weather condition column, so that a value of another
# string column (e.g. city or country) can never be rewritten by accident.
weather_measurements_aggregated_df = not_null_weather_measurements_df.replace(
    weather_conditions_dict,
    subset=[WEATHER_CONDITION_COL]
)

In [None]:
# The aggregated category names produced by the mapping
WEATHER_CONDITIONS = set(weather_conditions_dict.values())

# Keep only the rows whose condition is one of the aggregated categories: conditions that
# matched no keyword are absent from the mapping, were not replaced, and are dropped here
weather_measurements_aggregated_df = weather_measurements_aggregated_df \
    .filter(weather_measurements_aggregated_df[WEATHER_CONDITION_COL].isin(WEATHER_CONDITIONS))

In [None]:
if SLOW_OPERATIONS: weather_measurements_aggregated_df.groupBy(WEATHER_CONDITION_COL).count().show()

+-----------------+------+
|weather_condition| count|
+-----------------+------+
|            rainy|202725|
|            snowy| 21283|
|            sunny|641577|
|           cloudy|577842|
|     thunderstorm| 10852|
|            foggy|138707|
+-----------------+------+



In [None]:
def count_weather_condition_occurrences(dataframe: DataFrame, class_name: str) -> int:
    '''
    Counts the records of `dataframe` that belong to a given weather condition.

    Args:
        - dataframe: a `DataFrame` which contains a column `WEATHER_CONDITION_COL`
        - class_name: the class name to count the occurrences of

    Returns:
        the total number of `class_name` occurrences inside `dataframe`
    '''
    is_requested_class = dataframe[WEATHER_CONDITION_COL] == class_name
    matching_rows = dataframe.where(is_requested_class)
    return matching_rows.count()

In [None]:
def get_undersampling_fracs(dataframe: DataFrame) -> Dict[str, float]:
    '''
    Computes the per-class sampling fractions needed to undersample every class
    down to the size of the smallest class that is present.

    Args:
        - dataframe: a `DataFrame` of weather measurements which contains a column `WEATHER_CONDITION_COL`

    Returns:
        a dictionary that goes from a weather condition to its fraction
        that should be sampled in order to match the occurrences of the minority class.
        Classes without occurrences get a fraction of 0 and are ignored when
        looking for the minority class.
    '''
    class_names = ['rainy', 'snowy', 'sunny', 'foggy', 'cloudy', 'thunderstorm']

    # Count all the classes with a single aggregation instead of one Spark job per class
    observed_counts = {
        row[WEATHER_CONDITION_COL]: row['count']
        for row in dataframe.groupBy(WEATHER_CONDITION_COL).count().collect()
    }
    class_counts = {name: observed_counts.get(name, 0) for name in class_names}

    # An absent class must not be picked as the minority one,
    # otherwise every fraction would be 0 and the sample would be empty
    present_counts = [cnt for cnt in class_counts.values() if cnt > 0]
    if not present_counts:
        return {name: 0.0 for name in class_names}

    # np.min instead of the builtin `min`, since the latter may be shadowed
    # by the star import of `pyspark.sql.functions`
    minority_class_cnt = int(np.min(present_counts))

    return {
        name: minority_class_cnt / cnt if cnt > 0 else 0.0
        for name, cnt in class_counts.items()
    }

In [None]:
if LOAD_SAMPLED_DATASET:
    # Reuse the sample previously saved to the filesystem (nothing is recomputed)
    sampled_weather_measurements_df = spark.read.csv(SAMPLED_DATASET_PATH, header=True, inferSchema=True)
else:
    # Stratified undersampling of every class down to the size of the minority one.
    # It must run on the dataframe with the AGGREGATED weather conditions: the fractions are keyed
    # by the aggregated class names, which do not label the rows of the non-aggregated dataframe.
    sampled_weather_measurements_df = weather_measurements_aggregated_df.sampleBy(
        WEATHER_CONDITION_COL,
        fractions=get_undersampling_fracs(weather_measurements_aggregated_df),
        seed=RANDOM_SEED
    )

In [None]:
# Persist the sampled dataset only when it has just been computed
# (writing is skipped when it was loaded from this same path)
if SAVE_COMPUTATIONS and not LOAD_SAMPLED_DATASET:
    sampled_weather_measurements_df.write.csv(SAMPLED_DATASET_PATH,
                                              mode='overwrite',
                                              header=True)

In [None]:
sampled_weather_measurements_df.groupBy(WEATHER_CONDITION_COL).count().show()

+-----------------+-----+
|weather_condition|count|
+-----------------+-----+
|            rainy| 8373|
|            snowy| 8622|
|            sunny| 8656|
|           cloudy| 8644|
|     thunderstorm| 8553|
|            foggy| 8581|
+-----------------+-----+



In [None]:
sampled_weather_measurements_df.describe().to_koalas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
humidity,51429,73.35688035933033,20.633996204510503,5.0,100.0
pressure,51429,1017.105601897762,13.325849842166518,803.0,1100.0
temperature,51429,286.6298669477111,12.068790803267712,243.62,317.35
wind_direction,51429,183.87932878337125,103.6648132387239,0.0,360.0
wind_speed,51429,3.1757957572575783,2.291689346112425,0.0,35.0
weather_condition,51429,,,cloudy,thunderstorm
city,51429,,,Albuquerque,Vancouver
country,51429,,,Canada,United States
latitude,51429,37.886835537420296,5.830630652854522,25.774269,49.24966


In [None]:
# Random train/test split with weights 0.8 / 0.2, seeded with RANDOM_SEED
train_df, test_df = sampled_weather_measurements_df.randomSplit([0.8, 0.2], seed=RANDOM_SEED)

In [None]:
# necessary due to DataBricks limits (training on a dataframe larger than this threshold causes an Internal Server Error)
# limit() keeps at most MAX_TRAIN_SIZE rows of the train set
train_df = train_df.limit(MAX_TRAIN_SIZE)

In [None]:
print(f'Train set size:  {train_df.count()} instances')
print(f'Test set size:   {test_df.count()} instances')

Train set size:  41326 instances
Test set size:   10103 instances
