In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, udf, year, month, dayofmonth, dayofweek, datediff, to_date, regexp_replace, length, unix_timestamp, from_unixtime, log
from pyspark.sql.types import DoubleType

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import re
import matplotlib.pyplot as plt
from constants import TEST_TRANSFORMED_DATA, TRAIN_TRANSFORMED_DATA

In [2]:
# Start (or reuse) a Spark session in local mode, using every core Spark can see.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("CO2 Emission ML Pipeline - Modelling")
    .getOrCreate()
)

# Load the already-transformed train/test splits from the parquet locations
# imported from `constants`. Reads are lazy; nothing is materialised here.
TRAIN_TRANSFORMED_DF = spark.read.parquet(TRAIN_TRANSFORMED_DATA)
TEST_TRANSFORMED_DF = spark.read.parquet(TEST_TRANSFORMED_DATA)


your 131072x1 screen size is bogus. expect trouble
23/11/10 00:10:28 WARN Utils: Your hostname, Kris resolves to a loopback address: 127.0.1.1; using 172.18.209.221 instead (on interface eth0)
23/11/10 00:10:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/10 00:10:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
# Names of the categorical feature columns.
categorical_columns = [
    "transport_mode",
    "project_location",
    "material_category",
    "supplier_location",
]

# Names of the numeric feature columns that are scaled.
columns_to_scale = [
    "Quantity_Squared",
    "Distance_Covered_Squared",
    "Quantity_Distance_Interaction",
    "supplier_rating",
    "Transaction_Year",
    "Transaction_Month",
    "project_duration",
]

#### Linear Regression


In [4]:
# Baseline model: linear regression from the "features" vector column to the
# "log_CO2_emission" label column, with all other parameters left at defaults.
lr = LinearRegression(labelCol="log_CO2_emission", featuresCol="features")

# Fit on the training split, then score the held-out test split.
lr_model = lr.fit(TRAIN_TRANSFORMED_DF)
lr_predictions = lr_model.transform(TEST_TRANSFORMED_DF)

# Column settings shared by both evaluators below.
lr_eval_columns = {"labelCol": "log_CO2_emission", "predictionCol": "prediction"}

# RMSE on the test predictions.
lr_evaluator = RegressionEvaluator(metricName="rmse", **lr_eval_columns)
lr_rmse = lr_evaluator.evaluate(lr_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {lr_rmse}")

# R-squared on the same predictions (the evaluator name is rebound, as before).
lr_evaluator = RegressionEvaluator(metricName="r2", **lr_eval_columns)
lr_r2 = lr_evaluator.evaluate(lr_predictions)
print(f"R-squared on test data = {lr_r2}")


23/11/10 00:10:43 WARN Instrumentation: [272528e8] regParam is zero, which might cause numerical instability and overfitting.
23/11/10 00:10:44 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/11/10 00:10:44 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Root Mean Squared Error (RMSE) on test data = 1.0529085351593626
R-squared on test data = 0.3258366813389463


#### Gradient Boosted Trees

In [5]:
# Explicit seed passed to both the estimator and the cross-validator so that
# the fold assignment (and any sampling inside the trees) is repeatable when
# the notebook is re-run on a fresh kernel.
GBT_SEED = 42

# Gradient-boosted trees from the "features" vector column to "log_CO2_emission".
gbt = GBTRegressor(featuresCol="features", labelCol="log_CO2_emission", seed=GBT_SEED)

# Hyper-parameter grid: 2 depths x 2 bin counts x 2 iteration counts = 8 candidates.
gbt_paramGrid = (ParamGridBuilder()
                 .addGrid(gbt.maxDepth, [5, 10])
                 .addGrid(gbt.maxBins, [16, 32])
                 .addGrid(gbt.maxIter, [10, 20])
                 .build())

# RMSE is the model-selection metric used by the cross-validator.
gbt_evaluator = RegressionEvaluator(
    labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse"
)

# 3-fold cross-validation over the grid (8 candidates x 3 folds = 24 fits,
# plus the final refit of the best candidate).
gbt_cv = CrossValidator(estimator=gbt,
                        estimatorParamMaps=gbt_paramGrid,
                        evaluator=gbt_evaluator,
                        numFolds=3,
                        seed=GBT_SEED)

# Run cross-validation; the returned model wraps the best candidate.
gbt_cv_model = gbt_cv.fit(TRAIN_TRANSFORMED_DF)

# Predict on the held-out test data with the best model.
gbt_cv_predictions = gbt_cv_model.transform(TEST_TRANSFORMED_DF)

# RMSE on the test predictions.
gbt_cv_rmse = gbt_evaluator.evaluate(gbt_cv_predictions)
print(f"Root Mean Squared Error (RMSE) on test data with CV = {gbt_cv_rmse}")

# R-squared via a per-call param override. This deliberately does NOT call
# setMetricName on gbt_evaluator: that object is still attached to gbt_cv, and
# mutating it would silently change the selection metric of any later re-fit.
gbt_cv_r2 = gbt_evaluator.evaluate(gbt_cv_predictions, {gbt_evaluator.metricName: "r2"})
print(f"R-squared on test data with CV = {gbt_cv_r2}")

23/11/10 00:10:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/11/10 00:11:38 WARN DAGScheduler: Broadcasting large task binary with size 1002.6 KiB
23/11/10 00:11:38 WARN DAGScheduler: Broadcasting large task binary with size 1011.9 KiB
23/11/10 00:11:39 WARN DAGScheduler: Broadcasting large task binary with size 1030.1 KiB
23/11/10 00:11:39 WARN DAGScheduler: Broadcasting large task binary with size 1063.2 KiB
23/11/10 00:11:39 WARN DAGScheduler: Broadcasting large task binary with size 1117.4 KiB
23/11/10 00:11:39 WARN DAGScheduler: Broadcasting large task binary with size 1116.8 KiB
23/11/10 00:11:39 WARN DAGScheduler: Broadcasting large task binary with size 1117.3 KiB
23/11/10 00:11:39 WARN DAGScheduler: Broadcasting large task binary with size 1118.0 KiB
23/11/10 00:11:40 WARN DAGScheduler: Broadcasting large task binary with size 1119.0 KiB
23/11/10 0

Root Mean Squared Error (RMSE) on test data with CV = 0.5543713730414987
R-squared on test data with CV = 0.8131102779928129


#### Random Forest Regressor

In [7]:
# Explicit seed passed to both the forest and the cross-validator so that
# bootstrap/feature sampling and the fold assignment are repeatable when the
# notebook is re-run on a fresh kernel.
RF_SEED = 42

# Random forest from the "features" vector column to "log_CO2_emission".
rf = RandomForestRegressor(featuresCol="features", labelCol="log_CO2_emission", seed=RF_SEED)

# Hyper-parameter grid: 2 tree counts x 2 depths x 1 bin count = 4 candidates.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [10, 30])  # number of trees to test
             .addGrid(rf.maxDepth, [5, 10])   # maximum depths to test
             .addGrid(rf.maxBins, [32])       # bin counts to test
             .build())

# RMSE is the model-selection metric used by the cross-validator.
evaluator = RegressionEvaluator(labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse")

# 3-fold cross-validation over the grid; the evaluator above ranks candidates.
cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,
                    seed=RF_SEED)

# Run cross-validation.
cv_model = cv.fit(TRAIN_TRANSFORMED_DF)

# Use the best model found to make predictions on the test data.
cv_predictions = cv_model.transform(TEST_TRANSFORMED_DF)

# RMSE of the best model on the test predictions.
cv_rmse = evaluator.evaluate(cv_predictions)
print(f"Root Mean Squared Error (RMSE) on CV test data = {cv_rmse}")

# R-squared via a per-call param override. This deliberately does NOT call
# setMetricName on `evaluator`: that object is still attached to `cv`, and
# mutating it would silently change the selection metric of any later re-fit.
cv_r2 = evaluator.evaluate(cv_predictions, {evaluator.metricName: "r2"})
print(f"R-squared on CV test data = {cv_r2}")

# Keep a handle on the best fitted model selected by cross-validation.
best_rf_model = cv_model.bestModel

23/11/10 00:15:18 WARN DAGScheduler: Broadcasting large task binary with size 1425.6 KiB
23/11/10 00:15:21 WARN DAGScheduler: Broadcasting large task binary with size 1321.7 KiB
23/11/10 00:15:22 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/11/10 00:15:23 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/11/10 00:15:27 WARN DAGScheduler: Broadcasting large task binary with size 1415.0 KiB
23/11/10 00:15:31 WARN DAGScheduler: Broadcasting large task binary with size 1322.0 KiB
23/11/10 00:15:31 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
23/11/10 00:15:32 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/11/10 00:15:36 WARN DAGScheduler: Broadcasting large task binary with size 1412.2 KiB
23/11/10 00:15:40 WARN DAGScheduler: Broadcasting large task binary with size 1298.5 KiB
23/11/10 00:15:40 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/11/10 00:15:41 WARN DAGScheduler:

Root Mean Squared Error (RMSE) on CV test data = 0.5544081963375848
R-squared on CV test data = 0.8130854494231449
