In [None]:
import os 
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, TimestampType, IntegerType, DoubleType, LongType

In [None]:
# Start (or reuse) a local Spark session that uses all available cores
spark = (
    SparkSession.builder
    .appName('ML Model Development')
    .master('local[*]')
    .getOrCreate()
)

# S3A connector settings: the key pair is read from environment variables
# (a missing variable raises KeyError here), the endpoint is fixed.
spark.conf.set("fs.s3a.access.key", os.environ['AWS_ACCESS_KEY'])
spark.conf.set("fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY'])
spark.conf.set("fs.s3a.endpoint", "s3.amazonaws.com")

In [None]:
# Read merged final dataset from S3

# Glob matching every Parquet file of the merged dataset.
# NOTE(review): the credentials configured for this session use the fs.s3a.* keys;
# confirm that the s3:// scheme is served by that connector in this environment.
DATASET_PATH = 's3://data228/final-data/*.parquet'
df = spark.read.parquet(DATASET_PATH)
df.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- date: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- trip_duration: long (nullable = true)
 |-- weekend: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- dew: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- visibility: double (nullable = true)



In [None]:
# Importing required libraries for ML models development

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, FMRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [None]:
# Assemble valid features for training

# Predictor columns packed into the single 'features' vector column that the
# regressors below read. The label column ('total_amount') is not included.
assembler = VectorAssembler(
    inputCols=[
        'year', 'month', 'date', 'hour',
        'passenger_count', 'trip_distance', 'trip_duration',
        'PULocationID', 'DOLocationID', 'weekend',
        'temp', 'dew', 'humidity', 'windspeed', 'visibility',
    ],
    outputCol='features'
)

# Drop any 'features' column left over from a previous run of this cell before
# assembling: DataFrame.drop() ignores missing columns, while VectorAssembler fails
# if its output column already exists. This keeps the cell safe to re-run.
df = assembler.transform(df.drop('features'))

# Split data into training and test sets (80% / 20%, fixed seed)
train_data, test_data = df.randomSplit([0.8, 0.2], seed=2023)

# Path to save trained models
MODELS_PATH = "s3://data228/trained-models/"

### Linear Regression Model

In [None]:
# Linear regression estimator: 'features' vector -> 'total_amount'
lr = LinearRegression(labelCol='total_amount', featuresCol='features')

# Fit on the training split, then score the held-out test split
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

# One evaluator per metric, both comparing 'prediction' against 'total_amount'
r2_evaluator, loss_evaluator = (
    RegressionEvaluator(metricName=metric, labelCol='total_amount', predictionCol='prediction')
    for metric in ('r2', 'rmse')
)

# RMSE and R2 on the test predictions (note: lr_accuracy holds the R2 score)
lr_loss = loss_evaluator.evaluate(lr_predictions)
lr_accuracy = r2_evaluator.evaluate(lr_predictions)

print(
    f"Linear Regressor R2 score: {lr_accuracy}",
    f"Linear Regressor (RMSE): {lr_loss}",
    sep="\n",
)

# Persist the fitted model to S3, replacing any previously saved copy
lr_model.write().overwrite().save(f"{MODELS_PATH}lr_model")


Linear Regressor R2 score: 0.9098464932059501
Linear Regressor (RMSE): 4.184220896046416


### Random Forest Regression Model

In [None]:
# Random forest estimator: 'features' vector -> 'total_amount'
rf = RandomForestRegressor(labelCol='total_amount', featuresCol='features')

# Fit on the training split, then score the held-out test split
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

# One evaluator per metric, both comparing 'prediction' against 'total_amount'
r2_evaluator, loss_evaluator = (
    RegressionEvaluator(metricName=metric, labelCol='total_amount', predictionCol='prediction')
    for metric in ('r2', 'rmse')
)

# R2 and RMSE on the test predictions (note: rf_accuracy holds the R2 score)
rf_accuracy = r2_evaluator.evaluate(rf_predictions)
rf_loss = loss_evaluator.evaluate(rf_predictions)

print(
    f"Random Forest Regressor R2 score: {rf_accuracy}",
    f"Random Forest Regressor (RMSE): {rf_loss}",
    sep="\n",
)

# Persist the fitted model to S3, replacing any previously saved copy
rf_model.write().overwrite().save(f"{MODELS_PATH}rf_model")


Random Forest Regressor R2 score: 0.8893802623505866
Random Forest Regressor (RMSE): 4.634891838980823


### Gradient Boosted Trees Regression Model

In [None]:
# Gradient-boosted trees estimator: 'features' vector -> 'total_amount'
gbt = GBTRegressor(labelCol='total_amount', featuresCol='features')

# Fit on the training split, then score the held-out test split
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)

# One evaluator per metric, both comparing 'prediction' against 'total_amount'
r2_evaluator, loss_evaluator = (
    RegressionEvaluator(metricName=metric, labelCol='total_amount', predictionCol='prediction')
    for metric in ('r2', 'rmse')
)

# RMSE and R2 on the test predictions (note: gbt_accuracy holds the R2 score)
gbt_loss = loss_evaluator.evaluate(gbt_predictions)
gbt_accuracy = r2_evaluator.evaluate(gbt_predictions)

print(
    f"GBT Regressor R2 score: {gbt_accuracy}",
    f"GBT Regressor (RMSE): {gbt_loss}",
    sep="\n",
)

# Persist the fitted model to S3, replacing any previously saved copy
gbt_model.write().overwrite().save(f"{MODELS_PATH}gbt_model")


GBT Regressor R2 score: 0.9167801313839348
GBT Regressor (RMSE): 4.02009952172111
