In [3]:
from pyspark.sql import SparkSession

from pyspark.ml.regression import RandomForestRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from constants import TEST_TRANSFORMED_DATA, TRAIN_TRANSFORMED_DATA, MODELS, RANDOM_FOREST_MODEL

In [4]:
# Build (or reuse, via getOrCreate) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("CO2 Emission ML Pipeline - Modelling")
    .getOrCreate()
)

# Load the test and train DataFrames from the parquet locations defined in `constants`.
TEST_TRANSFORMED_DF = spark.read.parquet(TEST_TRANSFORMED_DATA)
TRAIN_TRANSFORMED_DF = spark.read.parquet(TRAIN_TRANSFORMED_DATA)


your 131072x1 screen size is bogus. expect trouble
23/11/18 00:12:01 WARN Utils: Your hostname, Kris resolves to a loopback address: 127.0.1.1; using 172.18.209.221 instead (on interface eth0)
23/11/18 00:12:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/18 00:12:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [5]:
# Categorical input columns; the feature-importance cell looks up a "<name>Vec"
# column in the training DataFrame for each of these.
categorical_columns = [
    "transport_mode",
    "project_location",
    "material_category",
    "supplier_location",
]

# Numerical columns labelled as "scaled"; used later to name feature-importance entries.
columns_to_scale = [
    "Quantity_Squared",
    "Distance_Covered_Squared",
    "Quantity_Distance_Interaction",
    "supplier_rating",
    "Transaction_Year",
    "Transaction_Month",
    "project_duration",
]

#### Linear Regression


In [6]:
# Baseline: ordinary linear regression on the assembled feature vector,
# predicting the log-transformed CO2 emission label.
lr = LinearRegression(featuresCol="features", labelCol="log_CO2_emission")
lr_model = lr.fit(TRAIN_TRANSFORMED_DF)

# Score the held-out test split.
lr_predictions = lr_model.transform(TEST_TRANSFORMED_DF)

# Both metrics compare the same label and prediction columns; only the metric name differs.
lr_evaluator_kwargs = dict(labelCol="log_CO2_emission", predictionCol="prediction")

# RMSE on the test split
lr_evaluator = RegressionEvaluator(metricName="rmse", **lr_evaluator_kwargs)
lr_rmse = lr_evaluator.evaluate(lr_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {lr_rmse}")

# R-squared on the test split (a fresh evaluator, bound to the same name)
lr_evaluator = RegressionEvaluator(metricName="r2", **lr_evaluator_kwargs)
lr_r2 = lr_evaluator.evaluate(lr_predictions)
print(f"R-squared on test data = {lr_r2}")


23/11/18 00:12:17 WARN Instrumentation: [cdc5c87c] regParam is zero, which might cause numerical instability and overfitting.
23/11/18 00:12:19 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/11/18 00:12:19 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Root Mean Squared Error (RMSE) on test data = 1.0529085351593626
R-squared on test data = 0.3258366813389463


In [7]:
# Each coefficient is the change in the predicted response for a one-unit change
# in that feature, with all other features held constant.
# Their magnitudes give a rough hint of feature importance.
# NOTE(review): magnitudes are only comparable if the features share a scale — confirm upstream scaling.
lr_coefficients = lr_model.coefficients
print(lr_coefficients)

[-0.026265564908581642,-0.023614727314001275,0.014240011875558171,-0.009430261892876688,-0.03693966286356247,0.0008391584442805165,-0.007710394896382353,-0.01580567302404461,0.005050603664929159,0.43910718992227366,-0.42599632305299817,0.6839255190338015,-0.003597240585000929,-0.014830168210552427,-0.0106627065498407,-0.009004012399024564,0.0011711854383697421]


The regression model is underfitting. It is not capturing the patterns in the data: an R² of about 0.33 means that most of the variance in the target variable is not explained by the model. The high RMSE also indicates poor model performance.
In such cases, it's useful to revisit the data as a whole and its features. Regularisation here is pointless because the model is weak and not complex enough for regularisation to help in any way. In fact, regularisation is used to prevent overfitting.

Regularisation L1 (Lasso): helps with feature selection and effectively removes certain features. It is useful when only a subset of the features is likely to be important. It adds a penalty equal to the absolute value of the magnitude of the model coefficients.

Regularisation L2 (Ridge Regression): this adds a penalty equal to the square of the magnitude of the coefficients. Unlike L1, it does not reduce coefficients to zero, but it minimises their impact. This is useful for overfitting models and in cases of multicollinearity.

The penalties are added to the loss function during training. The loss function is used to quantify how well the model is performing in terms of making predictions compared to the actual data. It measures the difference between the model's predictions and the actual data. The goal is to minimise this loss during the model's training. There are different types of loss functions depending on the problem we are dealing with: for example, MSE for regression or binary cross-entropy for binary classification.

#### Gradient Boosted Trees

In [5]:
# # initialize GBTRegressor (needs `from pyspark.ml.regression import GBTRegressor`, which the imports cell does not include)
# gbt = GBTRegressor(featuresCol="features", labelCol="log_CO2_emission")

# # create parameter grid 
# gbt_paramGrid = (ParamGridBuilder()
#                  .addGrid(gbt.maxDepth, [5, 10])
#                  .addGrid(gbt.maxBins, [16])
#                  .addGrid(gbt.maxIter, [10])
#                  .build())

# # initialize evaluator with the appropriate metric - rmse
# gbt_evaluator = RegressionEvaluator(
#     labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse"
# )

# # 3-fold CrossValidator (numFolds=3)
# gbt_cv = CrossValidator(estimator=gbt,
#                         estimatorParamMaps=gbt_paramGrid,
#                         evaluator=gbt_evaluator,
#                         numFolds=3)

# # run cross-validation
# gbt_cv_model = gbt_cv.fit(TRAIN_TRANSFORMED_DF)

# # predict on the test data
# gbt_cv_predictions = gbt_cv_model.transform(TEST_TRANSFORMED_DF)

# # evaluate the model
# gbt_cv_rmse = gbt_evaluator.evaluate(gbt_cv_predictions)
# print(f"Root Mean Squared Error (RMSE) on test data with CV = {gbt_cv_rmse}")

# # To evaluate R-squared
# gbt_evaluator.setMetricName("r2")
# gbt_cv_r2 = gbt_evaluator.evaluate(gbt_cv_predictions)
# print(f"R-squared on test data with CV = {gbt_cv_r2}")

#### Random Forest Regressor

In [6]:
# Fixed seed so that the random forest and the cross-validation fold assignment are
# reproducible across Restart & Run All. Without it, the reported metrics and the
# selected hyper-parameters can change from run to run.
RF_SEED = 42

# initialize RandomForest regressor on the assembled feature vector / log-transformed label
rf = RandomForestRegressor(featuresCol="features", labelCol="log_CO2_emission", seed=RF_SEED)

# hyper-parameter grid: 2 (numTrees) x 2 (maxDepth) x 1 (maxBins) = 4 candidate settings
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [10, 30])  # List of trees to test
             .addGrid(rf.maxDepth, [5, 10])    # List of maximum depths to test
             .addGrid(rf.maxBins, [32])        # List of bins to test
             .build())

# RMSE evaluator: used by the CrossValidator for model selection and for the test-set RMSE
evaluator = RegressionEvaluator(labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse")

# Separate R-squared evaluator. Calling evaluator.setMetricName("r2") would mutate the
# object that `cv` still holds, silently changing the selection metric if `cv` were re-fit.
r2_evaluator = RegressionEvaluator(labelCol="log_CO2_emission", predictionCol="prediction", metricName="r2")

# 3-fold cross-validation over the grid, selecting by RMSE
cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,  # Number of folds for cross-validation
                    seed=RF_SEED)

# run cross-validation
cv_model = cv.fit(TRAIN_TRANSFORMED_DF)

# Use the best model found to make predictions on the test data
cv_predictions = cv_model.transform(TEST_TRANSFORMED_DF)

# evaluate best model: RMSE
cv_rmse = evaluator.evaluate(cv_predictions)
print(f"Root Mean Squared Error (RMSE) on CV test data = {cv_rmse}")

# evaluate best model: R-squared
cv_r2 = r2_evaluator.evaluate(cv_predictions)
print(f"R-squared on CV test data = {cv_r2}")

# Get best model
best_rf_model = cv_model.bestModel

23/11/14 17:47:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/11/14 17:47:25 WARN DAGScheduler: Broadcasting large task binary with size 1447.3 KiB
23/11/14 17:47:31 WARN DAGScheduler: Broadcasting large task binary with size 1313.1 KiB
23/11/14 17:47:32 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/11/14 17:47:34 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/11/14 17:47:42 WARN DAGScheduler: Broadcasting large task binary with size 1370.9 KiB
23/11/14 17:47:47 WARN DAGScheduler: Broadcasting large task binary with size 1305.3 KiB
23/11/14 17:47:48 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/11/14 17:47:49 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
23/11/14 17:47:56 WARN DAGScheduler: Broadcasting large task binary with size 1408.0 KiB
23/11/14 17:48:01 WARN

Root Mean Squared Error (RMSE) on CV test data = 0.5571811882243928
R-squared on CV test data = 0.8112109866661488


#### Training Outcome
The results show that the tree-based models performed significantly better than the Linear Regression model in terms of both RMSE and R-squared. The Random Forest Regressor reached an RMSE of about 0.557 and an R-squared of about 0.811 on the test data, compared with about 1.053 and 0.326 for Linear Regression. The lower RMSE indicates better accuracy, and the much higher R-squared means the model explains a far greater proportion of the variance in the data. Note that the Gradient Boosted Trees (GBT) cell above is commented out, so its metrics are not reproduced in this notebook; the GBT comparison refers to an earlier run.

#### Feature importance Analysis
Analyzing feature importance is a crucial step in understanding and interpreting the model.

##### Random Forest Feature Importance
I'm taking Random Forest Regressor as my model of choice.

With PySpark we can use the attribute 'featureImportances' 

In [7]:
# Importance score of every slot in the feature vector, as a plain array.
importances = best_rf_model.featureImportances.toArray()

# Names for the one-hot encoded categorical features: one label per slot of each
# "<column>Vec" vector, with the vector size read from the first training row.
# NOTE(review): this assumes the `features` vector was assembled as
# [one-hot vectors..., scaled columns..., log_project_budget] — confirm against the
# transformation pipeline, because the assert below only checks the length, not the order.
feature_names = [
    f"{categoricalCol}_{i}"
    for categoricalCol in categorical_columns
    for i in range(TRAIN_TRANSFORMED_DF.select(categoricalCol + "Vec").head()[0].size)
]

# add names for the scaled numerical features
feature_names.extend(columns_to_scale)

# add the log-transformed feature
feature_names.append("log_project_budget")

# the number of names must match the number of importances, so assert that here
assert len(feature_names) == len(importances), f"Length of feature names ({len(feature_names)}) does not match the number of importances ({len(importances)})"

# pair each name with its importance, most important first
named_importances = sorted(
    zip(feature_names, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# print the ranked feature importances
for name, importance in named_importances:
    print(f"{name}: {importance}")


Quantity_Distance_Interaction: 0.5940227020648124
Quantity_Squared: 0.21326108274215863
Distance_Covered_Squared: 0.12943692727275058
log_project_budget: 0.01432131097945496
Transaction_Month: 0.011764418408622574
supplier_rating: 0.0065221628945606415
project_duration: 0.005466251565921377
Transaction_Year: 0.004292365452251261
transport_mode_1: 0.002864327503181511
transport_mode_0: 0.002478693954443479
supplier_location_0: 0.002446690729135823
project_location_0: 0.0024037122348859203
material_category_1: 0.002297656687177717
supplier_location_1: 0.002151430825210719
project_location_2: 0.002124007750478971
material_category_0: 0.0020896908323950144
project_location_1: 0.0020565681025581637


- quantity_distance_interaction (0.59): Most influential. Indicates combined impact of quantity and distance on predictions
- quantity_squared (0.213): shows non-linear relationship of quantity with the target
- distance_covered_squared (0.129): shows non-linear effects of distance
- log_project_budget (0.014): Modest impact, capturing budget scale effects
- transaction_month (0.011): Indicates minor seasonal trends

Remaining features have very minimal influence on the model.

- Given these importances, we can improve the model in future by training on fewer features. For now, I'll carry the random forest model forward to deployment.

In [8]:
# Persist the selected random forest model to RANDOM_FOREST_MODEL,
# overwriting whatever was previously saved at that path.
rf_model_writer = best_rf_model.write().overwrite()
rf_model_writer.save(RANDOM_FOREST_MODEL)

                                                                                