In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, udf, year, month, dayofmonth, dayofweek, datediff, to_date, regexp_replace, length, unix_timestamp, from_unixtime, log
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

import re
import matplotlib.pyplot as plt
from constants import TEST_TRANSFORMED_DATA, TRAIN_TRANSFORMED_DATA

In [None]:
# Create (or reuse) the Spark session for the modelling stage.
spark = (
    SparkSession.builder
    .appName("CO2 Emission ML Pipeline - Modelling")
    .getOrCreate()
)

# Load the already-transformed test and train sets from parquet; the paths
# come from the project's `constants` module.
TEST_TRANSFORMED_DF = spark.read.parquet(TEST_TRANSFORMED_DATA)
TRAIN_TRANSFORMED_DF = spark.read.parquet(TRAIN_TRANSFORMED_DATA)


In [4]:
# Categorical input columns (each is expected to have a matching
# "<name>Vec" one-hot encoded column in the transformed data).
categorical_columns = [
    "transport_mode",
    "project_location",
    "material_category",
    "supplier_location",
]

# Numerical input columns that were scaled during transformation.
columns_to_scale = [
    "Quantity_Squared",
    "Distance_Covered_Squared",
    "Quantity_Distance_Interaction",
    "supplier_rating",
    "Transaction_Year",
    "Transaction_Month",
    "project_duration",
]

In [3]:
# Fixed seed for the random forest's bootstrap sampling and feature
# subsampling, so the metrics below can be reproduced after a kernel
# restart instead of drifting from run to run.
RANDOM_SEED = 42

# Initialize the RandomForest regressor; it reads the assembled "features"
# vector and predicts the "log_CO2_emission" label.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="log_CO2_emission",
    seed=RANDOM_SEED,
)

# Train the model on the transformed training set
rf_model = rf.fit(TRAIN_TRANSFORMED_DF)

# Predict on the test data (adds a "prediction" column)
predictions = rf_model.transform(TEST_TRANSFORMED_DF)

# Evaluate the model: RMSE of the predictions against the label.
# Both metrics are on the scale of the "log_CO2_emission" label.
evaluator = RegressionEvaluator(labelCol="log_CO2_emission", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

# R-squared on the same predictions
evaluator = RegressionEvaluator(labelCol="log_CO2_emission", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print(f"R-squared on test data = {r2}")



Root Mean Squared Error (RMSE) on test data = 0.5783348695499914
R-squared on test data = 0.7966039182286223


In [5]:
# Get the feature importances of the fitted forest as a dense array
importances = rf_model.featureImportances.toArray()

# Fetch ONE row containing every one-hot encoded vector in a single Spark job
# (rather than one job per categorical column). The size of each vector is the
# number of slots that column contributes to the feature vector.
encoded_columns = [column + "Vec" for column in categorical_columns]
sample_row = TRAIN_TRANSFORMED_DF.select(*encoded_columns).head()
if sample_row is None:
    # head() returns None for an empty DataFrame; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    raise ValueError("TRAIN_TRANSFORMED_DF is empty; cannot determine one-hot encoded vector sizes")

# Rebuild the feature names in the order they are assumed to appear in the
# assembled "features" vector: one-hot slots, then scaled numerics, then the
# log-transformed budget.
# NOTE(review): this ordering is an assumption about the upstream
# transformation step; confirm it against the code that assembled "features".
feature_names = []
for position, categoricalCol in enumerate(categorical_columns):
    num_categories = sample_row[position].size
    # Suffix is the slot index within the encoded vector, not the category label
    feature_names += [f"{categoricalCol}_{i}" for i in range(num_categories)]

# Add names for the scaled numerical features
feature_names += columns_to_scale

# Add the log-transformed feature
feature_names.append("log_project_budget")

# Guard against a mismatch between the reconstructed names and the model's
# actual feature count before pairing them up
assert len(feature_names) == len(importances), f"Length of feature names ({len(feature_names)}) does not match the number of importances ({len(importances)})"

# Pair each name with its importance, most important first
named_importances = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

# Print the feature importances
for name, importance in named_importances:
    print(f"{name}: {importance}")


Quantity_Distance_Interaction: 0.5573003371925371
Quantity_Squared: 0.25001058762081685
Distance_Covered_Squared: 0.18706150008736913
log_project_budget: 0.0014752473940918102
Transaction_Month: 0.0009841259912263885
supplier_rating: 0.0007567558169999477
transport_mode_1: 0.0004239272269053222
Transaction_Year: 0.00034582762793127414
project_duration: 0.0002992808920306051
material_category_1: 0.00022148226658130296
project_location_0: 0.00020682231472337847
material_category_0: 0.00020117222781278294
project_location_2: 0.00018023740692027566
project_location_1: 0.0001580647951555075
supplier_location_1: 0.00014353005609024855
transport_mode_0: 0.0001341647091086035
supplier_location_0: 9.693637369927284e-05
