In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, udf, year, month, dayofmonth, dayofweek, datediff, to_date, regexp_replace, length, unix_timestamp, from_unixtime, log
from pyspark.sql.types import DoubleType

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler, 
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import re
import matplotlib.pyplot as plt
import pandas as pd
from constants import TEST_TRANSFORMED_DATA, TRAIN_TRANSFORMED_DATA

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("CO2 Emission ML Pipeline - Modelling") \
    .getOrCreate()

#reading df
TEST_TRANSFORMED_DF = spark.read.parquet(TEST_TRANSFORMED_DATA)
TRAIN_TRANSFORMED_DF = spark.read.parquet(TRAIN_TRANSFORMED_DATA)


your 131072x1 screen size is bogus. expect trouble
23/11/13 23:42:05 WARN Utils: Your hostname, Kris resolves to a loopback address: 127.0.1.1; using 172.18.209.221 instead (on interface eth0)
23/11/13 23:42:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/13 23:42:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
# List of categorical columns 
categorical_columns = ["transport_mode", "project_location", "material_category", "supplier_location"]
# list of scaled columns
columns_to_scale = ["Quantity_Squared", "Distance_Covered_Squared", "Quantity_Distance_Interaction", "supplier_rating", "Transaction_Year", "Transaction_Month", "project_duration"]

#### Linear Regression


In [4]:
# initialize the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="log_CO2_emission")

# train model
lr_model = lr.fit(TRAIN_TRANSFORMED_DF)

# predict on the test data
lr_predictions = lr_model.transform(TEST_TRANSFORMED_DF)

# evaluate the model
lr_evaluator = RegressionEvaluator(labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse")
lr_rmse = lr_evaluator.evaluate(lr_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {lr_rmse}")

lr_evaluator = RegressionEvaluator(labelCol="log_CO2_emission", predictionCol="prediction", metricName="r2")
lr_r2 = lr_evaluator.evaluate(lr_predictions)
print(f"R-squared on test data = {lr_r2}")


23/11/13 23:42:21 WARN Instrumentation: [cffed22e] regParam is zero, which might cause numerical instability and overfitting.
23/11/13 23:42:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/11/13 23:42:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Root Mean Squared Error (RMSE) on test data = 1.0529085351593626
R-squared on test data = 0.3258366813389463


#### Gradient Boosted Trees

In [8]:
# initialize GBTRegressor
gbt = GBTRegressor(featuresCol="features", labelCol="log_CO2_emission")

# create parameter grid 
gbt_paramGrid = (ParamGridBuilder()
                 .addGrid(gbt.maxDepth, [5, 10])
                 .addGrid(gbt.maxBins, [16])
                 .addGrid(gbt.maxIter, [10])
                 .build())

# initialize evaluator with the appropriate metric - rmse
gbt_evaluator = RegressionEvaluator(
    labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse"
)

# 5-fold CrossValidator
gbt_cv = CrossValidator(estimator=gbt,
                        estimatorParamMaps=gbt_paramGrid,
                        evaluator=gbt_evaluator,
                        numFolds=3)

# run cross-validation
gbt_cv_model = gbt_cv.fit(TRAIN_TRANSFORMED_DF)

# predict on the test data
gbt_cv_predictions = gbt_cv_model.transform(TEST_TRANSFORMED_DF)

# evaluate the model
gbt_cv_rmse = gbt_evaluator.evaluate(gbt_cv_predictions)
print(f"Root Mean Squared Error (RMSE) on test data with CV = {gbt_cv_rmse}")

# To evaluate R-squared
gbt_evaluator.setMetricName("r2")
gbt_cv_r2 = gbt_evaluator.evaluate(gbt_cv_predictions)
print(f"R-squared on test data with CV = {gbt_cv_r2}")

23/11/13 23:45:07 WARN DAGScheduler: Broadcasting large task binary with size 1013.0 KiB
23/11/13 23:45:07 WARN DAGScheduler: Broadcasting large task binary with size 1064.1 KiB
23/11/13 23:45:07 WARN DAGScheduler: Broadcasting large task binary with size 1063.7 KiB
23/11/13 23:45:07 WARN DAGScheduler: Broadcasting large task binary with size 1064.1 KiB
23/11/13 23:45:07 WARN DAGScheduler: Broadcasting large task binary with size 1064.8 KiB
23/11/13 23:45:07 WARN DAGScheduler: Broadcasting large task binary with size 1065.9 KiB
23/11/13 23:45:07 WARN DAGScheduler: Broadcasting large task binary with size 1068.1 KiB
23/11/13 23:45:08 WARN DAGScheduler: Broadcasting large task binary with size 1072.8 KiB
23/11/13 23:45:08 WARN DAGScheduler: Broadcasting large task binary with size 1081.3 KiB
23/11/13 23:45:08 WARN DAGScheduler: Broadcasting large task binary with size 1097.9 KiB
23/11/13 23:45:08 WARN DAGScheduler: Broadcasting large task binary with size 1126.5 KiB
23/11/13 23:45:08 WAR

Root Mean Squared Error (RMSE) on test data with CV = 0.5764742370042398
R-squared on test data with CV = 0.7979105542889521


#### Random Forest Regressor

In [5]:
# initialize RandomForest regressor
rf = RandomForestRegressor(featuresCol="features", labelCol="log_CO2_emission")

# parameter grid 
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [10, 30])  # List of trees to test
             .addGrid(rf.maxDepth, [5, 10])    # List of maximum depths to test
             .addGrid(rf.maxBins, [32])        # List of bins to test
             .build())

# evaluator for the cross-validation
evaluator = RegressionEvaluator(labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse")

# crossValidator requires the same evaluator used to evaluate the model
cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)  # Number of folds for cross-validation

# run cross-validation
cv_model = cv.fit(TRAIN_TRANSFORMED_DF)

# Use the best model found to make predictions on the test data
cv_predictions = cv_model.transform(TEST_TRANSFORMED_DF)

# evaluate best model
cv_rmse = evaluator.evaluate(cv_predictions)
print(f"Root Mean Squared Error (RMSE) on CV test data = {cv_rmse}")

#R-squared eevaluation
evaluator.setMetricName("r2")
cv_r2 = evaluator.evaluate(cv_predictions)
print(f"R-squared on CV test data = {cv_r2}")

# Get best model
best_rf_model = cv_model.bestModel

23/11/13 23:42:24 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/11/13 23:42:35 WARN DAGScheduler: Broadcasting large task binary with size 1393.3 KiB
23/11/13 23:42:40 WARN DAGScheduler: Broadcasting large task binary with size 1303.6 KiB
23/11/13 23:42:41 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/11/13 23:42:42 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
23/11/13 23:42:49 WARN DAGScheduler: Broadcasting large task binary with size 1381.6 KiB
23/11/13 23:42:54 WARN DAGScheduler: Broadcasting large task binary with size 1306.0 KiB
23/11/13 23:42:55 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/11/13 23:42:56 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
23/11/13 23:43:02 WARN DAGScheduler: Broadcasting large task binary with size 1399.9 KiB
23/11/13 23:43:06 WARN

Root Mean Squared Error (RMSE) on CV test data = 0.5538786257349492
R-squared on CV test data = 0.8134423603666683


#### Training Outcome
The results show that both the Gradient Boosted Trees (GBT) and Random Forest Regressor models have performed significantly better than the Linear Regression model in terms of both RMSE and R-squared. The RMSE is lower for the GBT and Random Forest models, indicating better accuracy, and the R-squared values are significantly higher, suggesting that these models explain a much greater proportion of the variance in the data.

#### Feature importance Analysis
Analyzing feature importance is a crucial step in understanding and interpreting the model.

##### Random Forest Feature Importance
I'm taking Random Forest Regressor as my model of choice
With PySpark we can use the attribute 'featureImportances' 

In [10]:
# extract feature importances
rf_feature_importances = best_rf_model.featureImportances.toArray()

# create a pandas df for easier visualization
features_df = pd.DataFrame(list(zip(TRAIN_TRANSFORMED_DF.columns, rf_feature_importances)),
                           columns=["feature", "importance"]).sort_values(by="importance", ascending=False)


In [11]:
features_df

Unnamed: 0,feature,importance
11,project_end_date,0.616706
9,project_name,0.195949
10,project_start_date,0.123666
16,supplier_name,0.014118
14,material_name,0.011746
12,project_location,0.006701
15,material_category,0.005179
13,supplier_id,0.004366
3,transaction_id,0.002889
0,material_id,0.002867


In [13]:
# Define the regression model
rf = RandomForestRegressor(featuresCol='features', labelCol='log_CO2_emission')

# Train the model
rf_model = rf.fit(TRAIN_TRANSFORMED_DF)

# Make predictions on the test data
predictions = rf_model.transform(TEST_TRANSFORMED_DF)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# You can save the trained model for later use or for deployment
# rf_model.save("path_to_save_model")


Root Mean Squared Error (RMSE) on test data = 0.573318


In [20]:
# extract feature importances
rf_feature_importancesss = rf_model.featureImportances.toArray()

# create a pandas df for easier visualization
features_dfff = pd.DataFrame(list(zip(TRAIN_TRANSFORMED_DF.columns, rf_feature_importancesss)),
                           columns=["feature", "importance"]).sort_values(by="importance", ascending=False)

features_dfff

Unnamed: 0,feature,importance
11,project_end_date,0.676418
9,project_name,0.218012
10,project_start_date,0.100544
16,supplier_name,0.001156
14,material_name,0.000815
12,project_location,0.000598
15,material_category,0.000492
2,project_budget,0.000351
13,supplier_id,0.000346
0,material_id,0.000309
