# Import libraries

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

import findspark
findspark.init()
findspark.find() 

import databricks.koalas as ks
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, DecisionTreeClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession



In [2]:
DATASET_PATH = 'datasets/historical-hourly-weather-dataset/'
AGGREGATED_DATASET_PATH = 'datasets/historical-hourly-weather-dataset/aggregated_sampled_weather_measurements'

# Machine Learning pipeline

### Load data

In [4]:
# Get all the csv files in the aggregated dataset folder
csv_files = [file for file in os.listdir(AGGREGATED_DATASET_PATH) if file.endswith('.csv')]

# Read each CSV file into a Koalas DataFrame and store them in a list
dfs = [ks.read_csv(os.path.join(AGGREGATED_DATASET_PATH, file)) for file in csv_files]

# Combine the DataFrames using the concat function
data = ks.concat(dfs, ignore_index = True)

### Pre-processing

Select relevant features and label column

In [5]:
# Select relevant features
numerical_cols = [
    'humidity',
    'pressure',
    'temperature',
    'wind_direction',
    'wind_speed',
    'latitude',
    'longitude'
]
nominal_cols = []
# Select the label column
label_col = 'weather_condition'
# Select the features and the label
df_selected = data[numerical_cols + [label_col]]

In [6]:
spark = SparkSession.builder.getOrCreate()
df_selected = df_selected.to_spark()

Train-Test split

In [7]:
train_data, test_data = df_selected.randomSplit([0.8, 0.2], seed = 42)

Encode

In [None]:
def encode(
    df,
    numerical_cols = [],
    nominal_cols = [],
    label_col = '',
    with_std = True,
    with_mean = True,
):
    # Convert categorical label to numerical label
    label_indexer = StringIndexer(
        inputCol = label_col,
        outputCol = 'label',
        handleInvalid = 'keep'
    )
    
    # Assemble features into a vector
    feature_cols = numerical_cols + nominal_cols
    vector_assembler = VectorAssembler(
        inputCols = feature_cols,
        outputCol = 'raw_features'
    )
    
    # Scale the features
    scaler = StandardScaler(
        inputCol = 'raw_features',
        outputCol = 'scaled_features',
        withStd = with_std,
        withMean = with_mean
    )
    
    stages = [label_indexer, vector_assembler, scaler]
    pipeline = Pipeline(stages = stages)
    
    transformer = pipeline.fit(df)
    
    return transformer

data_encoder = encode(
    df = df_selected,
    numerical_cols = numerical_cols,
    nominal_cols = nominal_cols,
    label_col = label_col
)

# Start training

### Random Forest

Define the classifier

In [9]:
classifier = RandomForestClassifier(
    featuresCol = 'scaled_features',
    labelCol = 'label',
    numTrees = 30
)

Define the pipeline with the encoding and classifier stages

In [10]:
pipeline = Pipeline(stages = [data_encoder, classifier])

Define the evaluator

In [11]:
evaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy'
)

Define hyperparameter tuning (optional)

In [12]:
# Define the grid of hyperparameters
param_grid = ParamGridBuilder().build()

# Set up the cross validator for model training and hyperparameter tuning
cross_validator = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = param_grid,
    evaluator = evaluator,
    numFolds = 5
)

Fit the model using the training data

In [13]:
model = cross_validator.fit(train_data)

                                                                                

Make predictions on the test data

In [14]:
predictions = model.transform(test_data)

Evaluate the model performance

In [15]:
accuracy = evaluator.evaluate(predictions)
print('Accuracy:', accuracy)

Accuracy: 0.5201468836840016


### Logistic Regression

Define the classifier

In [17]:
classifier = LogisticRegression(
    featuresCol = 'scaled_features',
    labelCol = 'label',
)

Define the pipeline with the encoding and classifier stages

In [18]:
pipeline = Pipeline(stages = [data_encoder, classifier])

Define the evaluator

In [19]:
evaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy'
)

Define hyperparameter tuning (optional)

In [20]:
# Define the grid of hyperparameters
param_grid = ParamGridBuilder().build()

# Set up the cross validator for model training and hyperparameter tuning
cross_validator = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = param_grid,
    evaluator = evaluator,
    numFolds = 5
)

Fit the model using the training data

In [21]:
model = cross_validator.fit(train_data)

23/11/10 21:47:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Make predictions on the test data

In [22]:
predictions = model.transform(test_data)

Evaluate the model performance

In [23]:
accuracy = evaluator.evaluate(predictions)
print('Accuracy:', accuracy)

Accuracy: 0.48947995236204844


### Decision Tree

Define the classifier

In [25]:
classifier = DecisionTreeClassifier(
    featuresCol = 'scaled_features',
    labelCol = 'label',
)

Define the pipeline with the encoding and classifier stages

In [26]:
pipeline = Pipeline(stages = [data_encoder, classifier])

Define the evaluator

In [27]:
evaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy'
)

Define hyperparameter tuning (optional)

In [28]:
# Define the grid of hyperparameters
param_grid = ParamGridBuilder().build()

# Set up the cross validator for model training and hyperparameter tuning
cross_validator = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = param_grid,
    evaluator = evaluator,
    numFolds = 5
)

Fit the model using the training data

In [29]:
model = cross_validator.fit(train_data)

Make predictions on the test data

In [30]:
predictions = model.transform(test_data)

Evaluate the model performance

In [31]:
accuracy = evaluator.evaluate(predictions)
print('Accuracy:', accuracy)

Accuracy: 0.508634378721715


### Multilayer Perceptron

Define the layers of the neural network

In [41]:
layers = [len(numerical_cols) + len(nominal_cols), 32, 64, 128, 6]

Define the classifier

In [49]:
classifier = MultilayerPerceptronClassifier(
    layers = layers,
    blockSize = 128,
    seed = 42,
    featuresCol = 'scaled_features',
    labelCol = 'label',
    maxIter = 500
)

Define the pipeline with the encoding and classifier stages

In [50]:
pipeline = Pipeline(stages = [data_encoder, classifier])

Define the evaluator

In [51]:
evaluator = MulticlassClassificationEvaluator(
    labelCol = 'label',
    predictionCol = 'prediction',
    metricName = 'accuracy'
)

Define hyperparameter tuning (optional)

In [52]:
# Define the grid of hyperparameters
param_grid = ParamGridBuilder().build()

# Set up the cross validator for model training and hyperparameter tuning
cross_validator = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = param_grid,
    evaluator = evaluator,
    numFolds = 5
)

Fit the model using the training data

In [53]:
model = cross_validator.fit(train_data)

Make predictions on the test data

In [54]:
predictions = model.transform(test_data)

Evaluate the model performance

In [55]:
accuracy = evaluator.evaluate(predictions)
print('Accuracy:', accuracy)

Accuracy: 0.556570067487098
